{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/learn/generation/structured-data/vectorizing-structured-data.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/learn/generation/structured-data/vectorizing-structured-data.ipynb)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4Hmy_-yTOWjP"
      },
      "source": [
        "# Setup\n",
        "\n",
        "Install the following libraries to work with this notebook.\n",
        "\n",
        "Note: You will need two API keys to run this notebook: a [Pinecone](https://www.pinecone.io/) serverless API key, which you can get at app.pinecone.io after signing up for an account, and an OpenAI API key, which you can get at [OpenAI](https://openai.com/blog/openai-api).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "4LslX1iX1XlV",
        "outputId": "61cf9950-4011-4a19-d13e-e88c6a59abfb"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Python 3.11.5\n"
          ]
        }
      ],
      "source": [
        "# This notebook runs on Python version:\n",
        "!python3 --version"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "U2aBWr00P_63",
        "outputId": "c682ac44-c94c-41a6-8e33-8e0908eb0b49"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.2.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
            "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
          ]
        }
      ],
      "source": [
        "# Installs. %pip (rather than !pip) installs into the kernel's own environment.\n",
        "%pip install -qU \\\n",
        "    \"pinecone-client[grpc]==3.2.1\" \\\n",
        "    \"unstructured[pdf]==0.12.4\" \\\n",
        "    langchain==0.1.9 \\\n",
        "    llama-index==0.10.23 \\\n",
        "    llama-index-vector-stores-pinecone==0.1.4 \\\n",
        "    pillow==10.0.0 \\\n",
        "    poppler-utils==0.1.0 \\\n",
        "    pytesseract==0.3.10"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "5pUgKiRiGPxh"
      },
      "outputs": [],
      "source": [
        "# Import libs you'll need:\n",
        "import json\n",
        "import os\n",
        "import re\n",
        "from typing import Any\n",
        "import requests\n",
        "\n",
        "from bs4 import BeautifulSoup, ResultSet\n",
        "from copy import deepcopy\n",
        "from IPython.display import HTML, display\n",
        "import pandas as pd\n",
        "from pathlib import Path\n",
        "from pinecone import ServerlessSpec\n",
        "from pinecone.grpc import PineconeGRPC\n",
        "\n",
        "\n",
        "from langchain.document_loaders import TextLoader\n",
        "from llama_index.core.indices.vector_store.base import VectorStoreIndex\n",
        "from llama_index.core.readers import download_loader\n",
        "from llama_index.core.ingestion.pipeline import IngestionPipeline\n",
        "from llama_index.core.node_parser import SemanticSplitterNodeParser\n",
        "from llama_index.core.query_engine import RetrieverQueryEngine\n",
        "from llama_index.core.retrievers import VectorIndexRetriever\n",
        "from llama_index.core.schema import Document, TransformComponent\n",
        "from llama_index.embeddings.openai import OpenAIEmbedding\n",
        "from llama_index.readers.file import PDFReader\n",
        "from llama_index.vector_stores.pinecone import PineconeVectorStore\n",
        "from unstructured.partition.pdf import partition_pdf\n",
        "\n",
        "# If you run into issues with LlamaIndex and LLM or VectorStore, run this command in a new cell:\n",
        "# !pip install llama-index --upgrade --no-cache-dir --force-reinstall"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "bTxuyuUmEIbM"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# The following simply makes print statements wrap text in Google Colab.\n",
        "def set_css():\n",
        "  \"\"\"Inject CSS so <pre> output wraps instead of scrolling horizontally.\"\"\"\n",
        "  display(HTML('''\n",
        "  <style>\n",
        "    pre {\n",
        "        white-space: pre-wrap;\n",
        "    }\n",
        "  </style>\n",
        "  '''))\n",
        "\n",
        "# Run set_css before every cell executes so the style applies to each cell's output.\n",
        "get_ipython().events.register('pre_run_cell', set_css)\n",
        "\n",
        "# This will ensure wrapped lines are also displayed within Pandas dataframes\n",
        "pd.set_option('display.max_colwidth', 400)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vXSDSzAlR-DZ"
      },
      "source": [
        "# Table extraction with [Unstructured](https://unstructured-io.github.io/unstructured/index.html)\n",
        "\n",
        "You will start by extracting embedded tables from a PDF using `Unstructured`. The strategy in this section largely follows the one outlined in [this blog post](https://unstructured.io/blog/mastering-table-extraction-revolutionize-your-earnings-reports-analysis-with-ai) by `Unstructured`.\n",
        "\n",
        "Note the following:\n",
        "- PDFs need the `hi_res` strategy parameter.\n",
        "- You will use [`\"yolox\"`](https://unstructured-io.github.io/unstructured/best_practices/models.html), a table-specific ML model for extracting embedded tables from PDFs.\n",
        "- You will set the `infer_table_structure` parameter to `True`, as per `Unstructured`'s instructions for using [`partition_pdf`](https://unstructured-io.github.io/unstructured/best_practices/table_extraction_pdf.html#method-1-using-partition-pdf).\n",
        "\n",
        "The PDF you'll be using is [Explain then Rank: Scale Calibration of Neural Rankers Using Natural Language Explanations from Large Language Models](https://arxiv.org/pdf/2402.12276.pdf). It has already been [uploaded to Github](https://github.com/pinecone-io/examples/tree/master/learn/generation/structured-data) for easy access."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "6sfvlW7JM0GL",
        "outputId": "1d3d967d-2270-4d04-c866-e41b9c7895fe"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "def download_from_github(gh_dir: str, file_name: str):\n",
        "    \"\"\"\n",
        "    Download file from Github.\n",
        "\n",
        "    :param gh_dir: Github directory that houses file,\n",
        "        e.g.https://github.com/pinecone-io/examples/blob/master/learn/generation/structured-data/\n",
        "\n",
        "        Note trailing \"/\".\n",
        "\n",
        "    :param file_name: Name of file (including file extension) you want to download.\n",
        "    \"\"\"\n",
        "    # GitHub serves raw file bytes from raw.githubusercontent.com without the \"/blob\" segment\n",
        "    raw_url = gh_dir.replace(\"https://github.com/\", \"https://raw.githubusercontent.com/\").replace(\"/blob\", \"\") + file_name\n",
        "\n",
        "    response = requests.get(raw_url)\n",
        "\n",
        "    # Bail out early on any non-200 status\n",
        "    if response.status_code != 200:\n",
        "        print(f\"Failed to download the file. Status code: {response.status_code}\")\n",
        "        return\n",
        "\n",
        "    # Persist the downloaded bytes next to the notebook (Path is imported in the imports cell)\n",
        "    Path(file_name).write_bytes(response.content)\n",
        "    print(f\"File '{file_name}' downloaded successfully.\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "0ksU9QbPNj1Q",
        "outputId": "42e57d1b-bfff-464d-e944-4ac985a72b9b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "File 'scale-calibration-of-neural-rankers.pdf' downloaded successfully.\n"
          ]
        }
      ],
      "source": [
        "# Download file from Github\n",
        "github_dir = \"https://github.com/pinecone-io/examples/blob/master/learn/generation/structured-data/\"\n",
        "filename = \"scale-calibration-of-neural-rankers.pdf\"\n",
        "\n",
        "download_from_github(github_dir, filename)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "wUwexkoPOaNy",
        "outputId": "aa2a29de-fc61-4276-cc02-a746b917781a"
      },
      "outputs": [],
      "source": [
        "# Note: this cell takes ~1-2mins to run in Colab.\n",
        "elements = partition_pdf(\n",
        "    filename=\"scale-calibration-of-neural-rankers.pdf\",\n",
        "    strategy=\"hi_res\",  # required for extracting embedded tables from PDFs\n",
        "    infer_table_structure=True,  # keep each table's structure as HTML in element metadata\n",
        "    model_name=\"yolox_quantized\",  # a bit faster than the plain Yolox model\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "Vk85Ug4MYVh2",
        "outputId": "a11535e7-4f00-45c4-8633-78b8981be474"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Keep only the elements Unstructured classified as tables\n",
        "tables = [element for element in elements if element.category == \"Table\"]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 16,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "hwtbt6sZP2XB",
        "outputId": "7e5b2300-d464-47a1-8bdc-217c939eb76f"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You are going to save the extracted Table elements to a .txt file that will be used by BeautifulSoup downstream.\n",
        "TEXT_FILE = \"scale-calibration-of-neural-rankers.txt\"\n",
        "\n",
        "# Save each table's HTML representation to the .txt file, blank-line separated\n",
        "with open(TEXT_FILE, 'w') as output_file:\n",
        "    for table in tables:\n",
        "        output_file.write(table.metadata.text_as_html + \"\\n\\n\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 17,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "3LWhOP20P8bO",
        "outputId": "332b1d72-b9a2-471e-c81f-6d18eb5028e2"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You will now use LangChain to load your \"documents\" (i.e. your tables)\n",
        "# NOTE: TextLoader reads the entire .txt file into a single Document.\n",
        "loader = TextLoader(TEXT_FILE)\n",
        "documents = loader.load()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "O3p5Nv_6QKQ-",
        "outputId": "403ba460-6ea3-442e-c78d-92b56975df56"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You will use BeautifulSoup to parse the HTML in your .txt file\n",
        "# NOTE(review): no parser is passed to BeautifulSoup, so it guesses the best installed one\n",
        "# (emitting a GuessedAtParserWarning); passing one explicitly (e.g. 'lxml') would pin behavior.\n",
        "html_tables = BeautifulSoup(documents[0].page_content).select('table')  # documents is only of len 1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 19,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "lwEYtgRqQTP4",
        "outputId": "f6b24837-2bd5-4af1-90a9-59061d338804"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "You've extracted 5 tables from your PDF!\n"
          ]
        }
      ],
      "source": [
        "# Note the number of embedded tables you've extracted from the PDF.\n",
        "print(f'You\\'ve extracted {len(html_tables)} tables from your PDF!')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 20,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "jcRUuZQ2x04d",
        "outputId": "dad36c57-718e-4a1c-dc1f-e20902e22a4b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You will now extract structured data from the tables you've extracted from the PDF.\n",
        "\n",
        "def extract_cols_and_rows(tables: ResultSet) -> tuple[list, list]:\n",
        "    \"\"\"\n",
        "    Grab column headers and rows from table elements.\n",
        "\n",
        "    :param tables: table elements extracted from the HTML text.\n",
        "    :return: tuple of (headers, rows). headers holds one list of column\n",
        "        names per table; rows holds the <td> rows of ALL tables flattened\n",
        "        into a single list (including empty rows).\n",
        "    \"\"\"\n",
        "    headers = []\n",
        "    rows = []\n",
        "    # Iterate over each table\n",
        "    for table in tables:\n",
        "        # Extract this table's column headers\n",
        "        headers.append([th.text for th in table.find_all('th')])\n",
        "        # Extract rows. Every <tr> contributes a row, even an empty one,\n",
        "        # so downstream manual indexing into the flat list stays stable.\n",
        "        for tr in table.find_all('tr'):\n",
        "            rows.append([td.text for td in tr.find_all('td')])\n",
        "    return headers, rows"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 21,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "eUNKbIv2Z-oq",
        "outputId": "29430704-32ab-4b3d-bd39-8be001ad4c9f"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "headers, rows = extract_cols_and_rows(html_tables)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2lBL9SuT5xso"
      },
      "source": [
        "You will only be playing with two tables in this example notebook, simply because it's easier than dealing with all 5.\n",
        "\n",
        "The two you will be using for your experiments are \"Table 1\" and \"Table 2\" in the PDF.\n",
        "\n",
        "Since nothing in table extraction is perfect (yet), you'll have to do some massaging of the extracted headers and rows to get them in the perfect, structured format."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "kPc_8OV5auez"
      },
      "source": [
        "#### Table 1 Construction"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 22,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "XzvEf2XU0xBC",
        "outputId": "40b608a3-8004-4f38-df2e-28013a6c195d"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Through manual investigation, you find their headers and rows in the \"headers\" and \"rows\" variables\n",
        "t1_headers = headers[1]  # index 1 (Table 1's columns) was found by manually inspecting `headers`"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 23,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "65qaKp5dbHm0",
        "outputId": "1a7cd3b4-6e6a-4252-eedb-8bf90b84c4ed"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "['Metric', 'TREC-DL', 'NTCIR-14']"
            ]
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "t1_headers"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 33,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "r5doBtbi0xFA",
        "outputId": "bf009fd2-e841-43f6-e586-831f52b931d3"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You find the correct rows\n",
        "t1_rows = rows[15:21]  # slice bounds found by manually inspecting the flattened `rows` list"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 34,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "[['# Queries (Train/Val/Test)', '| 97/53/67', '48/16/16'],\n",
              " ['Avg. # docs per query', '282.7', '345.3'],\n",
              " ['Levels of relevance', '4', '5'],\n",
              " ['Label dist. (low to high)', '58/22/14/6', '\u201448/23/17/8/3'],\n",
              " ['Avg. query length', '8.0', '22.0'],\n",
              " ['Avg. doc. length', '70.9', '493.2']]"
            ]
          },
          "execution_count": 34,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "t1_rows"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 35,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "PpcOfSuRa9e_",
        "outputId": "2a90f384-02a4-4ffe-cebd-8442de6f58b0"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You make your headers and rows into a dataframe for easy parsing downstream.\n",
        "df1 = pd.DataFrame(data=t1_rows, columns=t1_headers)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 36,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 238
        },
        "id": "tTBr9CNSbZjM",
        "outputId": "1f3da43e-3b14-41de-c858-bc622ad38186"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Metric</th>\n",
              "      <th>TREC-DL</th>\n",
              "      <th>NTCIR-14</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td># Queries (Train/Val/Test)</td>\n",
              "      <td>| 97/53/67</td>\n",
              "      <td>48/16/16</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Avg. # docs per query</td>\n",
              "      <td>282.7</td>\n",
              "      <td>345.3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Levels of relevance</td>\n",
              "      <td>4</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Label dist. (low to high)</td>\n",
              "      <td>58/22/14/6</td>\n",
              "      <td>\u201448/23/17/8/3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Avg. query length</td>\n",
              "      <td>8.0</td>\n",
              "      <td>22.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Avg. doc. length</td>\n",
              "      <td>70.9</td>\n",
              "      <td>493.2</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                       Metric     TREC-DL       NTCIR-14\n",
              "0  # Queries (Train/Val/Test)  | 97/53/67       48/16/16\n",
              "1       Avg. # docs per query       282.7          345.3\n",
              "2         Levels of relevance           4              5\n",
              "3   Label dist. (low to high)  58/22/14/6  \u201448/23/17/8/3\n",
              "4           Avg. query length         8.0           22.0\n",
              "5            Avg. doc. length        70.9          493.2"
            ]
          },
          "execution_count": 36,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Take a look at your constructed table:\n",
        "df1"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q9Iafm3Yayqu"
      },
      "source": [
        "#### Table 2 Construction"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 37,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 303
        },
        "id": "UcpvwECAaUYM",
        "outputId": "be5b58f8-137c-4a27-c042-58d82c1d18f1"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "['Metric',\n",
              " 'Ranking',\n",
              " 'Calibration',\n",
              " 'Ranking',\n",
              " 'Calibration',\n",
              " '',\n",
              " 'nDCG',\n",
              " 'nDCG@10',\n",
              " 'CB-ECE',\n",
              " 'ECE',\n",
              " 'MSE |',\n",
              " 'nDCG',\n",
              " 'nDCG@10',\n",
              " 'CB-ECE',\n",
              " 'ECE',\n",
              " 'MSE']"
            ]
          },
          "execution_count": 37,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# headers[2] is not well formed (likely because the table's headers are nested), so you will manually overwrite it in the next cell\n",
        "headers[2]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 38,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "Be_qccioagkS",
        "outputId": "5efa69d4-b5da-4867-ac0f-2cd12fe4c747"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Manually concatenate the nested headers to maintain semantic relations in a single header per column\n",
        "# This is a design choice that you will need to discuss with stakeholders. You can structure the data extracted from your tables\n",
        "# in any way that makes sense to you.\n",
        "\n",
        "t2_headers = ['Method',\n",
        "              'TREC-ndcg',\n",
        "              'TREC-ndcg@10',\n",
        "              'TREC-CB-ECE',\n",
        "              'TREC-ECE',\n",
        "              'TREC-MSE',\n",
        "              'NTCIR-ndcg',\n",
        "              'NTCIR-ndcg@10',\n",
        "              'NTCIR-CB-ECE',\n",
        "              'NTCIR-ECE',\n",
        "              'NTCIR-MSE']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 41,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "X77R-pQp1i4g",
        "outputId": "e7c1eced-dfb7-48ba-e543-e4ac624cf01a"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "t2_rows = rows[21:-9]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 43,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "4-_WAkHpV6Qx",
        "outputId": "f04ce05b-4b5d-45a9-d043-834ab4ec63db"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "df2 = pd.DataFrame(data=t2_rows, columns=t2_headers)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 44,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "hlmFDHtCayEt",
        "outputId": "dfd2f63e-6776-4b19-a8c9-3d6c36fb663e"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Our extraction technique did not grab the unlabeled index column in Table 2, which contains the\n",
        "# classification category each of the listed methods falls into (A, B, C...)\n",
        "\n",
        "# So, add this in:\n",
        "categories = ['A', 'B', 'C', 'C', 'D', 'E', 'F', 'F']  # Note we have to duplicate some categories since the table has two values per row in some places\n",
        "\n",
        "df2.insert(1, 'Category', categories)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 45,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 474
        },
        "id": "lmpEAg4NV9lv",
        "outputId": "97db51db-2681-4111-cdbd-102bf3c3979a"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Method</th>\n",
              "      <th>Category</th>\n",
              "      <th>TREC-ndcg</th>\n",
              "      <th>TREC-ndcg@10</th>\n",
              "      <th>TREC-CB-ECE</th>\n",
              "      <th>TREC-ECE</th>\n",
              "      <th>TREC-MSE</th>\n",
              "      <th>NTCIR-ndcg</th>\n",
              "      <th>NTCIR-ndcg@10</th>\n",
              "      <th>NTCIR-CB-ECE</th>\n",
              "      <th>NTCIR-ECE</th>\n",
              "      <th>NTCIR-MSE</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Uncalibrated monoBERT</td>\n",
              "      <td>A</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.205</td>\n",
              "      <td>\u20140.320.-\u2014\u20140.773</td>\n",
              "      <td></td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.757</td>\n",
              "      <td>0.799</td>\n",
              "      <td>1.824</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Post hoc + monoBERT</td>\n",
              "      <td>B</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.141</td>\n",
              "      <td>0.125</td>\n",
              "      <td>0.684 |</td>\n",
              "      <td>0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.624</td>\n",
              "      <td>0.457_\u2014\u00ab1.462</td>\n",
              "      <td></td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Finetune monoBERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.776</td>\n",
              "      <td>0.422</td>\n",
              "      <td>1.093</td>\n",
              "      <td>0.221</td>\n",
              "      <td>\u00ab0.721 |</td>\n",
              "      <td>0.696</td>\n",
              "      <td>0.268</td>\n",
              "      <td>1.843</td>\n",
              "      <td>0.709</td>\n",
              "      <td>\u20181.874</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Finetune BERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.738</td>\n",
              "      <td>0.327</td>\n",
              "      <td>1.253</td>\n",
              "      <td>0.266</td>\n",
              "      <td>~=\u2014-0.785 |</td>\n",
              "      <td>0.727</td>\n",
              "      <td>0.285</td>\n",
              "      <td>1.756</td>\n",
              "      <td>0.546</td>\n",
              "      <td>\u00ab1.416</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>LLM prompting w/ rubrics</td>\n",
              "      <td>D</td>\n",
              "      <td>0.786</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.000</td>\n",
              "      <td>1.246</td>\n",
              "      <td>2.137 |</td>\n",
              "      <td>0.728</td>\n",
              "      <td>0.328</td>\n",
              "      <td>1.2947</td>\n",
              "      <td>1.194</td>\n",
              "      <td>2.773</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Post hoc + MC Sampling LLM</td>\n",
              "      <td>E</td>\n",
              "      <td>0.790</td>\n",
              "      <td>0.473</td>\n",
              "      <td>1.165</td>\n",
              "      <td>0.145</td>\n",
              "      <td>0.673</td>\n",
              "      <td>| 0.736</td>\n",
              "      <td>=~ 0.364\"</td>\n",
              "      <td>1.677</td>\n",
              "      <td>0.472</td>\n",
              "      <td>\u20181.540</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>Literal Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.815'</td>\n",
              "      <td>0.529\"</td>\n",
              "      <td>0.996\u00b0</td>\n",
              "      <td>0.067\"</td>\n",
              "      <td>0.602\" |</td>\n",
              "      <td>0.742</td>\n",
              "      <td>0.340</td>\n",
              "      <td>1.534\"</td>\n",
              "      <td>0.355</td>\n",
              "      <td>1.3307</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>Conditional Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.822</td>\n",
              "      <td>0.5347</td>\n",
              "      <td>0.862'</td>\n",
              "      <td>0.428</td>\n",
              "      <td>0.832 |</td>\n",
              "      <td>0.720</td>\n",
              "      <td>0.322</td>\n",
              "      <td>1.405'</td>\n",
              "      <td>0.2577</td>\n",
              "      <td>1.2907</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                           Method Category TREC-ndcg TREC-ndcg@10 TREC-CB-ECE  \\\n",
              "0           Uncalibrated monoBERT        A     0.799        0.494       1.205   \n",
              "1             Post hoc + monoBERT        B     0.799        0.494       1.141   \n",
              "2               Finetune monoBERT        C     0.776        0.422       1.093   \n",
              "3                   Finetune BERT        C     0.738        0.327       1.253   \n",
              "4        LLM prompting w/ rubrics        D     0.786        0.457       1.000   \n",
              "5      Post hoc + MC Sampling LLM        E     0.790        0.473       1.165   \n",
              "6      Literal Explanation + BERT        F    0.815'       0.529\"      0.996\u00b0   \n",
              "7  Conditional Explanation + BERT        F     0.822       0.5347      0.862'   \n",
              "\n",
              "          TREC-ECE     TREC-MSE NTCIR-ndcg NTCIR-ndcg@10 NTCIR-CB-ECE  \\\n",
              "0  \u20140.320.-\u2014\u20140.773                 | 0.735         0.337        1.757   \n",
              "1            0.125      0.684 |      0.735         0.337        1.624   \n",
              "2            0.221     \u00ab0.721 |      0.696         0.268        1.843   \n",
              "3            0.266  ~=\u2014-0.785 |      0.727         0.285        1.756   \n",
              "4            1.246      2.137 |      0.728         0.328       1.2947   \n",
              "5            0.145        0.673    | 0.736     =~ 0.364\"        1.677   \n",
              "6           0.067\"     0.602\" |      0.742         0.340       1.534\"   \n",
              "7            0.428      0.832 |      0.720         0.322       1.405'   \n",
              "\n",
              "       NTCIR-ECE NTCIR-MSE  \n",
              "0          0.799     1.824  \n",
              "1  0.457_\u2014\u00ab1.462            \n",
              "2          0.709    \u20181.874  \n",
              "3          0.546    \u00ab1.416  \n",
              "4          1.194     2.773  \n",
              "5          0.472    \u20181.540  \n",
              "6          0.355    1.3307  \n",
              "7         0.2577    1.2907  "
            ]
          },
          "execution_count": 45,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# You can see some weirdness here w/missing decimal points and blank cells, so you will manually clean your dataframe below.\n",
        "df2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "_Qjns6YNWup0",
        "outputId": "e256d404-3d56-47c8-a126-c5c043524a8d"
      },
      "outputs": [],
      "source": [
        "# From PDF, you know the actual values these cells need to be, so set them here.\n",
        "# Use a single indexing operation, df2.loc[row, col] = value, rather than chained\n",
        "# indexing like df2[col].iloc[row] = value: chained assignment triggers\n",
        "# SettingWithCopyWarning and, under pandas Copy-on-Write (opt-in since 2.0,\n",
        "# default in 3.0), silently fails to update df2 at all.\n",
        "df2.loc[0, 'TREC-ECE'] = '0.320'\n",
        "df2.loc[0, 'TREC-MSE'] = '0.773'\n",
        "df2.loc[1, 'NTCIR-ECE'] = '0.457'\n",
        "df2.loc[1, 'NTCIR-MSE'] = '1.462'\n",
        "df2.loc[0, 'TREC-CB-ECE'] = '1.205'\n",
        "df2.loc[1, 'TREC-CB-ECE'] = '1.141'\n",
        "df2.loc[5, 'TREC-CB-ECE'] = '1.165'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 47,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 474
        },
        "id": "pyQbEPxL2XHA",
        "outputId": "f6532f9b-3f8c-4614-bba4-47eeb9321642"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Method</th>\n",
              "      <th>Category</th>\n",
              "      <th>TREC-ndcg</th>\n",
              "      <th>TREC-ndcg@10</th>\n",
              "      <th>TREC-CB-ECE</th>\n",
              "      <th>TREC-ECE</th>\n",
              "      <th>TREC-MSE</th>\n",
              "      <th>NTCIR-ndcg</th>\n",
              "      <th>NTCIR-ndcg@10</th>\n",
              "      <th>NTCIR-CB-ECE</th>\n",
              "      <th>NTCIR-ECE</th>\n",
              "      <th>NTCIR-MSE</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Uncalibrated monoBERT</td>\n",
              "      <td>A</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.205</td>\n",
              "      <td>0.320</td>\n",
              "      <td>0.773</td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.757</td>\n",
              "      <td>0.799</td>\n",
              "      <td>1.824</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Post hoc + monoBERT</td>\n",
              "      <td>B</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.141</td>\n",
              "      <td>0.125</td>\n",
              "      <td>0.684 |</td>\n",
              "      <td>0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.624</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.462</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Finetune monoBERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.776</td>\n",
              "      <td>0.422</td>\n",
              "      <td>1.093</td>\n",
              "      <td>0.221</td>\n",
              "      <td>\u00ab0.721 |</td>\n",
              "      <td>0.696</td>\n",
              "      <td>0.268</td>\n",
              "      <td>1.843</td>\n",
              "      <td>0.709</td>\n",
              "      <td>\u20181.874</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Finetune BERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.738</td>\n",
              "      <td>0.327</td>\n",
              "      <td>1.253</td>\n",
              "      <td>0.266</td>\n",
              "      <td>~=\u2014-0.785 |</td>\n",
              "      <td>0.727</td>\n",
              "      <td>0.285</td>\n",
              "      <td>1.756</td>\n",
              "      <td>0.546</td>\n",
              "      <td>\u00ab1.416</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>LLM prompting w/ rubrics</td>\n",
              "      <td>D</td>\n",
              "      <td>0.786</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.000</td>\n",
              "      <td>1.246</td>\n",
              "      <td>2.137 |</td>\n",
              "      <td>0.728</td>\n",
              "      <td>0.328</td>\n",
              "      <td>1.2947</td>\n",
              "      <td>1.194</td>\n",
              "      <td>2.773</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Post hoc + MC Sampling LLM</td>\n",
              "      <td>E</td>\n",
              "      <td>0.790</td>\n",
              "      <td>0.473</td>\n",
              "      <td>1.165</td>\n",
              "      <td>0.145</td>\n",
              "      <td>0.673</td>\n",
              "      <td>| 0.736</td>\n",
              "      <td>=~ 0.364\"</td>\n",
              "      <td>1.677</td>\n",
              "      <td>0.472</td>\n",
              "      <td>\u20181.540</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>Literal Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.815'</td>\n",
              "      <td>0.529\"</td>\n",
              "      <td>0.996\u00b0</td>\n",
              "      <td>0.067\"</td>\n",
              "      <td>0.602\" |</td>\n",
              "      <td>0.742</td>\n",
              "      <td>0.340</td>\n",
              "      <td>1.534\"</td>\n",
              "      <td>0.355</td>\n",
              "      <td>1.3307</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>Conditional Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.822</td>\n",
              "      <td>0.5347</td>\n",
              "      <td>0.862'</td>\n",
              "      <td>0.428</td>\n",
              "      <td>0.832 |</td>\n",
              "      <td>0.720</td>\n",
              "      <td>0.322</td>\n",
              "      <td>1.405'</td>\n",
              "      <td>0.2577</td>\n",
              "      <td>1.2907</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                           Method Category TREC-ndcg TREC-ndcg@10 TREC-CB-ECE  \\\n",
              "0           Uncalibrated monoBERT        A     0.799        0.494       1.205   \n",
              "1             Post hoc + monoBERT        B     0.799        0.494       1.141   \n",
              "2               Finetune monoBERT        C     0.776        0.422       1.093   \n",
              "3                   Finetune BERT        C     0.738        0.327       1.253   \n",
              "4        LLM prompting w/ rubrics        D     0.786        0.457       1.000   \n",
              "5      Post hoc + MC Sampling LLM        E     0.790        0.473       1.165   \n",
              "6      Literal Explanation + BERT        F    0.815'       0.529\"      0.996\u00b0   \n",
              "7  Conditional Explanation + BERT        F     0.822       0.5347      0.862'   \n",
              "\n",
              "  TREC-ECE     TREC-MSE NTCIR-ndcg NTCIR-ndcg@10 NTCIR-CB-ECE NTCIR-ECE  \\\n",
              "0    0.320        0.773    | 0.735         0.337        1.757     0.799   \n",
              "1    0.125      0.684 |      0.735         0.337        1.624     0.457   \n",
              "2    0.221     \u00ab0.721 |      0.696         0.268        1.843     0.709   \n",
              "3    0.266  ~=\u2014-0.785 |      0.727         0.285        1.756     0.546   \n",
              "4    1.246      2.137 |      0.728         0.328       1.2947     1.194   \n",
              "5    0.145        0.673    | 0.736     =~ 0.364\"        1.677     0.472   \n",
              "6   0.067\"     0.602\" |      0.742         0.340       1.534\"     0.355   \n",
              "7    0.428      0.832 |      0.720         0.322       1.405'    0.2577   \n",
              "\n",
              "  NTCIR-MSE  \n",
              "0     1.824  \n",
              "1     1.462  \n",
              "2    \u20181.874  \n",
              "3    \u00ab1.416  \n",
              "4     2.773  \n",
              "5    \u20181.540  \n",
              "6    1.3307  \n",
              "7    1.2907  "
            ]
          },
          "execution_count": 47,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Great! You can leave the special chars, etc. They shouldn't matter too much.\n",
        "# (The stray symbols visible in the output are PDF text-extraction artifacts.)\n",
        "# Preview the second extracted table:\n",
        "df2"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SxgoRqamWeMF"
      },
      "source": [
        "# LlamaIndex and Pinecone\n",
        "Now that you have the 2 tables for experimentation, you will use [LlamaIndex](https://docs.llamaindex.ai/en/stable/) to turn the rest of the PDF (including the embedded tables, any diagrams, images, etc.) into [Document](https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/root.html) objects.\n",
        "\n",
        "This step is necessary because your earlier `documents` object, which you generated from your `.txt` file using `LangChain`, contains only your extracted *tables*, not the rest of the PDF's content.\n",
        "\n",
        "The main tools you'll use from LlamaIndex are as follows:\n",
        "- [`PDFReader`](https://github.com/run-llama/llama_index/blob/50806ba526dde4a054842394fe32e3880646fe6d/llama-index-legacy/llama_index/legacy/readers/file/docs_reader.py#L16) from LlamaHub\n",
        "- [`SemanticSplitterNodeParser`](https://docs.llamaindex.ai/en/stable/api/llama_index.core.node_parser.SemanticSplitterNodeParser.html#semanticsplitternodeparser), which splits a Document into Nodes, with each node being a group of semantically related sentences.\n",
        "- [`IngestionPipeline`](https://docs.llamaindex.ai/en/stable/module_guides/loading/ingestion_pipeline/root.html) to build an ETL flow that chunks your PDF, embeds it (i.e. vectorizes it), and then stores it in Pinecone in a specific [`namespace`](https://docs.pinecone.io/docs/namespaces).\n",
        "- [`RetrieverQueryEngine`](https://github.com/run-llama/llama_index/blob/v0.10.12/llama-index-core/llama_index/core/query_engine/retriever_query_engine.py#L27) for querying your LLM in the RAG pipeline you'll build.\n",
        "\n",
        "Learn more about using LlamaIndex with Pinecone on our [Integrations page](https://github.com/pinecone-io/examples/blob/master/learn/generation/llama-index/using-llamaindex-with-pinecone.ipynb)."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qXxvbbNCf13y"
      },
      "source": [
        "## Load all PDF contents with LlamaIndex\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 48,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "hqAVtw_r0lH5",
        "outputId": "8ad396d4-6f85-4f32-c71a-670b2f8dd197"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Read in your PDF with LlamaIndex's PDFReader.\n",
        "# The reader produces one Document per PDF page (note the page_label metadata\n",
        "# in the preview below).\n",
        "\n",
        "loader = PDFReader()\n",
        "path = Path('scale-calibration-of-neural-rankers.pdf')  # PDF must be in the working directory\n",
        "ctrl_docs = loader.load_data(file=path)  # 'ctrl' = the control (no special table handling) variant"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 49,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 591
        },
        "id": "hHxHW8mDkkk9",
        "outputId": "475b639f-dcb3-4f1f-8442-3aba7cc7b4d8"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "Document(id_='b375ddec-d6c1-4046-b99d-f6c48513c62b', embedding=None, metadata={'page_label': '7', 'file_name': 'scale-calibration-of-neural-rankers.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Explain then Rank: Scale Calibration of Neural Rankers Using Natural Language Explanations from Large Language Models Conference\u201917, July 2017, Washington, DC, USA\\nTable 2: Ranking and scale calibration performance of baseline methods and our approaches on two scale calibration datasets\\nTREC and NTCIR. Note that lower is better with calibration metrics (CB-ECE, ECE and MSE). Statistically significant improve-\\nments over \u201cPlatt Scaling monoBERT\u201d are marked with\u2020.\\nCollection TREC NTCIR\\nMetricRanking Calibration Ranking Calibration\\nnDCG nDCG@10 CB-ECE ECE MSE nDCG nDCG@10 CB-ECE ECE MSE\\nA Uncalibrated monoBERT 0.799 0.494 1.205 0.320 0.773 0.735 0.337 1.757 0.799 1.824\\nB Post hoc + monoBERT 0.799 0.494 1.141 0.125 0.684 0.735 0.337 1.624 0.457 1.462\\nCFinetune monoBERT 0.776 0.422 1.093 0.221 0.721 0.696 0.268 1.843 0.709 1.874\\nFinetune BERT 0.738 0.327 1.253 0.266 0.785 0.727 0.285 1.756 0.546 1.416\\nD LLM prompting w/ rubrics 0.786 0.457 1.000\u20201.246 2.137 0.728 0.328 1.294\u20201.194 2.773\\nE Post hoc + MC Sampling LLM 0.790 0.473 1.165 0.145 0.673 0.736 0.364\u20201.677 0.472 1.540\\nFLiteral Explanation + BERT 0.815\u20200.529\u20200.996\u20200.067\u20200.602\u20200.742 0.340 1.534\u20200.355\u20201.330\u2020\\nConditional Explanation + BERT 0.822\u20200.534\u20200.862\u20200.428 0.832 0.720 0.322 1.405\u20200.257\u20201.290\u2020\\nMSE\\nSoftmax MultiObj\\nCalibrated Softmax\\nTraining Conditions0.6750.7000.7250.7500.7750.8000.8250.850nDCG\\n0.7100.746\\n0.7100.7380.8100.817\\n0.8100.815\\n0.8060.821\\n0.8060.809Ranking Performance\\nMSE\\nSoftmax MultiObj\\nCalibrated Softmax\\nTraining Conditions0.80.91.01.11.21.3CB-ECE1.2301.291\\n1.2311.253\\n0.9951.081\\n1.001 
0.996\\n0.965\\n0.8640.965\\n0.879Scale Calibration PerformanceQuery + Document Literal Explanation Conditional Explanation\\nFigure 2: Ranking and scale calibration performance of the baseline (neural ranker taking query and documents) and NLE-based\\napproaches on TREC, using four different optimization objectives. NLE-based approaches consistently yield better ranking\\n(left) and calibration (right) performance.\\nThe training is conducted over a maximum of 10 epochs, selecting\\nthe best model based on validation set loss. To mitigate the impact\\nof randomness due to the limited dataset size in terms of query\\nnumbers, each experiment is run with 5 different random seeds. The\\nmetrics reported are averaged across these five runs and the statisti-\\ncal significance is determined using t-tests with Bonferroni correc-\\ntion [ 42] at the 95% confidence level. In relation to the components\\nand hyper-parameters used in Algorithm 1, we employ ROUGE-\\nL [19] as the text similarity function Sfollowing Quach et al . [31] .\\nThe similarity threshold \ud835\udf06is set to 0.35, with the sampling budget\\n\ud835\udc58\ud835\udc59(maximum number of responses) fixed at 20 and \ud835\udc58\ud835\udc60(maximumnumber of sentences in the meta NLE) at 30. The repository con-\\ntaining the source code, processed data, and detailed instructions\\nto facilitate the reproduction of the results reported in this study is\\navailable at https://github.com/PxYu/LLM-NLE-for-Calibration.\\n4.5 Results and Analysis\\nRQ1 . The central research question of this study is to determine if\\nNLEs generated by LLMs enhance the calibration and ranking per-\\nformance of neural rankers. 
We present the main evaluation results\\nin Table 2, categorizing each method according to the classifications\\nestablished in \u00a7 4.3 for clear distinction.', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')"
            ]
          },
          "execution_count": 49,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview a Document (index 6 corresponds to page 7 of the PDF -- see page_label)\n",
        "ctrl_docs[6]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OgGAXjYPkz4k"
      },
      "source": [
        "## Create Pinecone serverless index to store and retrieve Document Nodes\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 51,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "suwf8mF-f85h",
        "outputId": "91c454b3-c201-49bf-e3f9-9c706be79e32"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# SECURITY NOTE: never commit a real key. Prefer exporting PINECONE_API_KEY in your\n",
        "# shell (or using getpass) instead of hardcoding it in the notebook source.\n",
        "os.environ['PINECONE_API_KEY'] = \"<your-key-from-app.pinecone.io>\"  # REPLACE THIS WITH YOUR API KEY!\n",
        "pinecone_api_key = os.getenv(\"PINECONE_API_KEY\")\n",
        "\n",
        "# Initialize connection to Pinecone\n",
        "pc = PineconeGRPC(api_key=pinecone_api_key)\n",
        "index_name = \"structured-data-example\"\n",
        "\n",
        "# Create the serverless index only if it does not already exist (keeps this cell idempotent)\n",
        "if index_name not in pc.list_indexes().names():\n",
        "    pc.create_index(\n",
        "        index_name,\n",
        "        dimension=1536,  # Dimensions match encoder (embedder/vectorizer) you will use downstream, ada-002 from OpenAI.\n",
        "        # NOTE(review): no similarity metric is specified, so Pinecone's default is used -- confirm it suits your data.\n",
        "        spec=ServerlessSpec(cloud=\"aws\", region=\"us-west-2\"),\n",
        "    )\n",
        "\n",
        "# Initialize your index\n",
        "pinecone_index = pc.Index(index_name)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 52,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 88
        },
        "id": "3VEiPjDk8smd",
        "outputId": "d982a786-d743-4d79-c213-45208eac3a42"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "{'dimension': 1536,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'': {'vector_count': 0}},\n",
              " 'total_vector_count': 0}"
            ]
          },
          "execution_count": 52,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Confirm creation of your index & that (if new) it has no vectors in it yet\n",
        "# (expected: total_vector_count == 0 for a freshly created index).\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 35,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "QeJxnniVg-Pk",
        "outputId": "d483486e-ecf9-4535-bcee-970dc22404c6"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# If for any reason you want to delete your Pinecone index and start over, execute this code:\n",
        "# WARNING: deletion is permanent and removes every namespace and vector in the index.\n",
        "# pc.delete_index(index_name)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MfXGt584gDAN"
      },
      "source": [
        "## Connect to Pinecone via LlamaIndex and build indexing pipeline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Q55FipLo9GR0"
      },
      "source": [
        "Below, you will build an indexing pipeline via LlamaIndex. You will upload your initial batch of vectors into a Pinecone index, in the `\"control\"` namespace. You will then use this namespace to compare and contrast downstream LLM answers to variants in your experiment.\n",
        "\n",
        "Note: You will need an OpenAI API key for this step."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 53,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "u2CWYlOM-SGB",
        "outputId": "2d795a88-720c-408b-efd3-7392c05076e7"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Set/Get your OpenAI API Key\n",
        "# SECURITY NOTE: never commit a real key; prefer exporting OPENAI_API_KEY in your\n",
        "# shell (or using getpass) instead of hardcoding it here.\n",
        "\n",
        "os.environ['OPENAI_API_KEY'] = \"<your-openai-key>\"  # REPLACE THIS WITH YOUR API KEY!\n",
        "openai_api_key = os.getenv(\"OPENAI_API_KEY\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 54,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "RGH76RnKwBus",
        "outputId": "013bb7c9-6163-4e96-fc71-13f4a2247609"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Declare embedding model you will use throughout notebook:\n",
        "# OpenAI's ada-002 text embedding model is the model you will use both for Node parsing and for vectorization of PDF contents.\n",
        "# (Its 1536-dim vectors match the dimension of the Pinecone index created above.)\n",
        "EMBED_MODEL = OpenAIEmbedding(api_key=openai_api_key)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 55,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "Jf9lGN6hCEbG",
        "outputId": "e267bd88-6737-4c1d-d69a-a999cc37a8ec"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# You will need to re-define Pinecone as a LlamaIndex PineconeVectorStore obj when you add namespaces, so build a\n",
        "# function to help you do that:\n",
        "# NOTE(review): the annotation says PineconeGRPC (the client), but callers pass the object\n",
        "# returned by pc.Index(index_name) -- i.e. the index handle, not the client. Worth confirming/fixing.\n",
        "def initialize_vector_store(index: PineconeGRPC, namespace: str) -> PineconeVectorStore:\n",
        "    \"\"\"\n",
        "    Initialize Pinecone index as a VectorStore obj.\n",
        "\n",
        "    :param index: Pinecone serverless index.\n",
        "    :param namespace: Namespace constraint you want on your queries, indexing operations, etc. when using this vector store.\n",
        "    :return: PineconeVectorStore obj.\n",
        "    \"\"\"\n",
        "    return PineconeVectorStore(pinecone_index=index, namespace=namespace)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 56,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "k2hgX5kjkzUB",
        "outputId": "f4292a6b-8e04-4ce6-932b-0ba14c5073ad"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "def run_indexing_pipeline(vector_store, documents, embed_model=EMBED_MODEL):\n",
        "    \"\"\"\n",
        "    Chunk, embed, and upsert documents into a Pinecone-backed vector store.\n",
        "\n",
        "    :param vector_store: PineconeVectorStore (determines the target index and namespace).\n",
        "    :param documents: LlamaIndex Document objects to ingest.\n",
        "    :param embed_model: Embedding model, used both for semantic splitting and for vectorizing nodes.\n",
        "    :return: The nodes produced by the pipeline run.\n",
        "    \"\"\"\n",
        "    # Define pipeline stages\n",
        "    pipeline = IngestionPipeline(\n",
        "        transformations=[\n",
        "            # CleanTextForOpenAI(),  # Clean doc text\n",
        "            SemanticSplitterNodeParser(\n",
        "                buffer_size=1,  # number of sentences grouped together when evaluating split points\n",
        "                breakpoint_percentile_threshold=95,  # split where embedding distance is above the 95th percentile\n",
        "                embed_model=embed_model,\n",
        "                ),\n",
        "            embed_model,  # Vectorize nodes\n",
        "            ],\n",
        "        vector_store=vector_store # Index into Pinecone\n",
        "        )\n",
        "\n",
        "    # Run documents through pipeline\n",
        "    return pipeline.run(documents=documents)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 57,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "215e257d9fb04bccb56294e896980191",
            "f19030ae29974a40960c10b74da57448",
            "ef005b3de2d54e6daebf4c5e8646c9fc",
            "cd514cf0a991465181bda8f6dddd50b2",
            "8ab1d70479dd474b99130c1b884d9841",
            "6d8ad326ae7549dfb345f920aa4f80a2",
            "73b3ed9119c34b19a6053f7bcddca07f",
            "6f5a6d184238401781982f74518f5bc0",
            "ba5c0de8a8814cd1a885c4d75dbeecf6",
            "4cc572b919844d83adbc9d8200ba333b",
            "aa4fbb29126d46b383f69a48c91f8adc"
          ]
        },
        "id": "34jiuAie_OSG",
        "outputId": "5a798503-bc78-479f-84a9-5f5b6c8110b0"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Upserted vectors: 100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 48/48 [00:00<00:00, 88.20it/s]\n"
          ]
        }
      ],
      "source": [
        "# Declare namespace you will put your first batch of vectors into:\n",
        "ctrl_namespace = 'control'\n",
        "\n",
        "# Initialize vector store w/control namespace\n",
        "ctrl_vector_store = initialize_vector_store(pinecone_index, ctrl_namespace)\n",
        "\n",
        "# Run pipeline (this makes OpenAI embedding API calls, so it incurs token costs)\n",
        "output = run_indexing_pipeline(ctrl_vector_store, ctrl_docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 58,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 88
        },
        "id": "gY5s2MRFlXYB",
        "outputId": "c5fb33d8-81f4-4d57-d649-8c128b7a041f"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "{'dimension': 1536,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'control': {'vector_count': 48}},\n",
              " 'total_vector_count': 48}"
            ]
          },
          "execution_count": 58,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Confirm your docs made it to the index, in the right namespace\n",
        "# (expected: a 'control' namespace with a nonzero vector_count).\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gBxWx48DloCD"
      },
      "source": [
        "# Build RAG pipeline, background on experiments\n",
        "\n",
        "You will run a variety of RAG experiments to figure out which way of vectorizing table data works best (i.e. provides the most accurate answers).\n",
        "\n",
        "You will run two families of experiments:\n",
        "1. Baseline RAG experiment where you do not do anything special to your PDF (this is the \"control\" variant)\n",
        "2. Experiments where you explicitly vectorize the extracted table elements (`df_1` and `df_2`) in different ways. The different ways you will experiment with are:\n",
        "- Concatenating all row data (`v1`)\n",
        "- Concatenating all row data with header data, too (`v2`)\n",
        "- Concatenating all row data with header data, and with table description data (`v3`)\n",
        "- Injecting table values into a natural language template (`v4`)\n",
        "\n",
        "\n",
        "You will ask your LLM the same 7 questions (defined below) across all experiment variants."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "97s81i7MlB6J"
      },
      "source": [
        "## Questions\n",
        "\n",
        "You will ask the following 7 questions during each experiment. The answers were given by humans who read the article."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 42,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "bi8mbrHAmxJ1",
        "outputId": "179a4f84-2f01-4f56-f708-21b3b21fdb7c"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# The 7 evaluation questions asked in every experiment variant, with human-written\n",
        "# reference answers aligned by position: ANSWERS[i] is the gold answer for QUERIES[i].\n",
        "QUERIES = [\n",
        "    \"How does the average query length compare to the average document length in table 1?\",\n",
        "    \"What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?\",\n",
        "    \"What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?\",\n",
        "    \"How do i interpret table 2's calibration and ranking scores?\",\n",
        "    \"What are the weights of \\\"Uncalibrated monoBERT\\\" tuned on?\",\n",
        "    \"What category was used to build and train literal explanation + BERT? what does this category mean?\",\n",
        "    \"Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?\"\n",
        "]\n",
        "\n",
        "ANSWERS = [\n",
        "    \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n",
        "    \"TREC: 0.529; NTCIR: 0.340.\",\n",
        "    \"NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios\",\n",
        "    \"Lower is better for calibration, higher is better for ranking\",\n",
        "    \"MSMarco\",\n",
        "    \"Category F: training nle-based neural rankers on calibration data.\",\n",
        "    \"Yes\"\n",
        "]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UxJqv8rgVQcp"
      },
      "source": [
        "## Build Control RAG pipeline\n",
        "\n",
        "You will use the following RAG pipeline for each of your experiments. It fetches the top `5` semantic search results from Pinecone to use as context to send to your LLM.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 43,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "aP3sKC68R7_7",
        "outputId": "0129ea71-5524-4925-b9f7-c2f9fb52cfa2"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "def run_rag_pipeline(vector_store, queries, k=5, filters=None):\n",
        "    \"\"\"\n",
        "    Send queries to an LLM, having it take context from a vector store (and namespace).\n",
        "\n",
        "    :param vector_store: Your Pinecone vector store.\n",
        "    :param queries: The queries you want to ask your LLM.\n",
        "    :param k: The number of results you want retrieved as context from your Pinecone index.\n",
        "    :param filters: Option to add metadata filters to the request if desired.\n",
        "    :return: Tuple of responses from your LLM, one per query, in query order.\n",
        "    \"\"\"\n",
        "\n",
        "    # Instantiate VectorStoreIndex object from our vector_store object\n",
        "    vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)\n",
        "\n",
        "    # Build the retriever arguments once so the constructor call isn't\n",
        "    # duplicated; only pass `filters` when the caller supplied one.\n",
        "    retriever_kwargs = {\n",
        "        \"index\": vector_index,\n",
        "        \"similarity_top_k\": k,\n",
        "        \"namespace\": vector_store.namespace,\n",
        "    }\n",
        "    if filters:\n",
        "        retriever_kwargs[\"filters\"] = filters\n",
        "    retriever = VectorIndexRetriever(**retriever_kwargs)\n",
        "\n",
        "    # Query engine\n",
        "    query_engine = RetrieverQueryEngine(retriever=retriever)\n",
        "\n",
        "    # Collect one response per query; a list comprehension avoids the\n",
        "    # quadratic cost of repeated tuple concatenation.\n",
        "    responses = [query_engine.query(query).response for query in queries]\n",
        "\n",
        "    return tuple(responses)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LkvQKVs5Br2B"
      },
      "source": [
        "# Run experiments"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hZoI6qszB0Zc"
      },
      "source": [
        "## Control variant RAG pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 99,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "3w_YaeMHlwo4",
        "outputId": "5a954bec-b08f-426b-f29b-a4b459b4005f"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Run the RAG pipeline for the control use case and unpack one response\n",
        "# per test query; the order of the responses matches the order of QUERIES.\n",
        "one_ctrl, two_ctrl, three_ctrl, four_ctrl, five_ctrl, six_ctrl, seven_ctrl = run_rag_pipeline(ctrl_vector_store, QUERIES)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 100,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 463
        },
        "id": "TowNyNO8cp8k",
        "outputId": "99eb856c-4faa-4530-a451-c1e222ec94e2"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "One: The average query length is shorter compared to the average document length in Table 1.\n",
            "-----\n",
            "Two: 0.529 and 0.534\n",
            "-----\n",
            "Three: Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additionally, NLEs help in elucidating the rationale behind system decisions and enhancing task efficacy, ultimately improving the overall effectiveness of these models in document ranking tasks.\n",
            "-----\n",
            "Four: Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger symbol. The values in the table represent the performance of each method in terms of ranking and calibration on the respective datasets.       \n",
            "Five: The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.\n",
            "-----\n",
            "Six: Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\n",
            "-----\n",
            "Seven: Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.\n"
          ]
        }
      ],
      "source": [
        "# Print each control response, consistently separated by \"-----\".\n",
        "# (The original hand-built f-string dropped the separator between\n",
        "# Four and Five and leaked indentation spaces from its line continuation.)\n",
        "labels = (\"One\", \"Two\", \"Three\", \"Four\", \"Five\", \"Six\", \"Seven\")\n",
        "ctrl_answers = (one_ctrl, two_ctrl, three_ctrl, four_ctrl, five_ctrl, six_ctrl, seven_ctrl)\n",
        "print(\"\\n-----\\n\".join(f\"{label}: {answer}\" for label, answer in zip(labels, ctrl_answers)))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 101,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 651
        },
        "id": "Zv4gOIXZaue_",
        "outputId": "0801bf6f-2990-4412-da6e-cf7ba7d0bae5"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"exp_results\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"ANSWER\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n          \"TREC: 0.529; NTCIR: 0.340.\",\n          \"Category F: training nle-based neural rankers on calibration data.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"control\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in Table 1.\",\n          \"0.529 and 0.534\",\n          \"Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "exp_results"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-abe9c9e0-fd12-4fc5-bd68-9443392de388\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ANSWER</th>\n",
              "      <th>control</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>How does the average query length compare to the average document length in table 1?</th>\n",
              "      <td>The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)</td>\n",
              "      <td>The average query length is shorter compared to the average document length in Table 1.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?</th>\n",
              "      <td>TREC: 0.529; NTCIR: 0.340.</td>\n",
              "      <td>0.529 and 0.534</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?</th>\n",
              "      <td>NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>How do i interpret table 2's calibration and ranking scores?</th>\n",
              "      <td>Lower is better for calibration, higher is better for ranking</td>\n",
              "      <td>Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the weights of \"Uncalibrated monoBERT\" tuned on?</th>\n",
              "      <td>MSMarco</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What category was used to build and train literal explanation + BERT? what does this category mean?</th>\n",
              "      <td>Category F: training nle-based neural rankers on calibration data.</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?</th>\n",
              "      <td>Yes</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-abe9c9e0-fd12-4fc5-bd68-9443392de388')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-abe9c9e0-fd12-4fc5-bd68-9443392de388 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-abe9c9e0-fd12-4fc5-bd68-9443392de388');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-2c0ea11b-f60a-4fd2-aed9-1ff4a7d18f96\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-2c0ea11b-f60a-4fd2-aed9-1ff4a7d18f96')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-2c0ea11b-f60a-4fd2-aed9-1ff4a7d18f96 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                                                                                                                                                                                                                                                                                              ANSWER  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                        The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                              TREC: 0.529; NTCIR: 0.340.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?                    NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                                                                                           Lower is better for calibration, higher is better for ranking   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                    MSMarco   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                               Category F: training nle-based neural rankers on calibration data.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                   Yes   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    control  \n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in Table 1.  \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.529 and 0.534  \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...  \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...  \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.  \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...  \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.  "
            ]
          },
          "execution_count": 101,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Start a dataframe that you can continue to add your results to as you run future experiments.\n",
        "# Collect the seven control responses in order; pairing them with ANSWERS/QUERIES via\n",
        "# zip/slicing avoids the copy-paste-and-tweak pattern of seven hand-written dict literals.\n",
        "controls = [one_ctrl, two_ctrl, three_ctrl, four_ctrl, five_ctrl, six_ctrl, seven_ctrl]\n",
        "\n",
        "ctrl_responses = [{'ANSWER': answer, 'control': control}\n",
        "                  for answer, control in zip(ANSWERS, controls)]\n",
        "\n",
        "# Index the results by the corresponding query text (first seven queries, matching the\n",
        "# seven control runs above).\n",
        "exp_results = pd.DataFrame(data=ctrl_responses, index=QUERIES[:7])\n",
        "\n",
        "exp_results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qC_SV_K8oQfi"
      },
      "source": [
        "## Variant 1: concatenate row values\n",
        "\n",
        "For this variant, you will concatenate each row of your extracted tables (stored in `df1` and `df2`). You will then create vectors of each of these rows, upsert them into a Pinecone namespace, and run a RAG pipeline to see how your LLM's responses differ from the actual answers and the control variant's answers, given this vectorization strategy."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 102,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 206
        },
        "id": "JIxk-iZCWdLS",
        "outputId": "82875e4f-3e73-4183-f2ca-da253cd3a0ab"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"df1\",\n  \"rows\": 6,\n  \"fields\": [\n    {\n      \"column\": \"Metric\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"# Queries (Train/Val/Test)\",\n          \"Avg. # docs per query\",\n          \"Avg. doc. length\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-DL\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"| 97/53/67\",\n          \"282.7\",\n          \"70.9\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-14\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"48/16/16\",\n          \"345.3\",\n          \"493.2\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "df1"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-a7a9ad24-90fd-43cb-b1f7-85ecda0519bf\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Metric</th>\n",
              "      <th>TREC-DL</th>\n",
              "      <th>NTCIR-14</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td># Queries (Train/Val/Test)</td>\n",
              "      <td>| 97/53/67</td>\n",
              "      <td>48/16/16</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Avg. # docs per query</td>\n",
              "      <td>282.7</td>\n",
              "      <td>345.3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Levels of relevance</td>\n",
              "      <td>4</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Label dist. (low to high)</td>\n",
              "      <td>58/22/14/6</td>\n",
              "      <td>48/23/17/8/3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Avg. query length</td>\n",
              "      <td>8.0</td>\n",
              "      <td>22.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a7a9ad24-90fd-43cb-b1f7-85ecda0519bf')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-a7a9ad24-90fd-43cb-b1f7-85ecda0519bf button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-a7a9ad24-90fd-43cb-b1f7-85ecda0519bf');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-6419732b-6e36-4d0c-b5dc-c84cd452df39\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-6419732b-6e36-4d0c-b5dc-c84cd452df39')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-6419732b-6e36-4d0c-b5dc-c84cd452df39 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                       Metric     TREC-DL      NTCIR-14\n",
              "0  # Queries (Train/Val/Test)  | 97/53/67      48/16/16\n",
              "1       Avg. # docs per query       282.7         345.3\n",
              "2         Levels of relevance           4             5\n",
              "3   Label dist. (low to high)  58/22/14/6  48/23/17/8/3\n",
              "4           Avg. query length         8.0          22.0"
            ]
          },
          "execution_count": 102,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Quick reminder what your extracted tables look like:\n",
        "# Table 1 — dataset statistics (queries, docs per query, relevance levels, etc.)\n",
        "# for TREC-DL and NTCIR-14, extracted earlier into df1.\n",
        "df1.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 48,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 310
        },
        "id": "_aPMmd3cWdQz",
        "outputId": "0cee9382-3516-4c2f-8be3-9ddbf15139f9"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"df2\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"Method\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"Post hoc + monoBERT\",\n          \"Post hoc + MC Sampling LLM\",\n          \"Uncalibrated monoBERT\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Category\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"A\",\n          \"B\",\n          \"F\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ndcg\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.799\",\n          \"0.776\",\n          \"0.815*\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ndcg@10\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.494\",\n          \"0.422\",\n          \"0.529%\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-CB-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.141\",\n          \"1.165\",\n          \"1.205\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.125\",\n          \"0.145\",\n          \"0.320\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": 
\"\"\n      }\n    },\n    {\n      \"column\": \"TREC-MSE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.684\",\n          \"0.673\",\n          \"0.773\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ndcg\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"| 0.735\",\n          \"0.696\",\n          \"0.742\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ndcg@10\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.337\",\n          \"0.268\",\n          \"0.340\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-CB-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.624\",\n          \"1.677\",\n          \"1.757\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.457\",\n          \"0.472\",\n          \"0.799\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-MSE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.462\",\n          \"\\u20181.540\",\n          \"1.824\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "df2"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-b5e12db8-81bb-4bb4-a332-17c90730a4ce\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Method</th>\n",
              "      <th>Category</th>\n",
              "      <th>TREC-ndcg</th>\n",
              "      <th>TREC-ndcg@10</th>\n",
              "      <th>TREC-CB-ECE</th>\n",
              "      <th>TREC-ECE</th>\n",
              "      <th>TREC-MSE</th>\n",
              "      <th>NTCIR-ndcg</th>\n",
              "      <th>NTCIR-ndcg@10</th>\n",
              "      <th>NTCIR-CB-ECE</th>\n",
              "      <th>NTCIR-ECE</th>\n",
              "      <th>NTCIR-MSE</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Uncalibrated monoBERT</td>\n",
              "      <td>A</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.205</td>\n",
              "      <td>0.320</td>\n",
              "      <td>0.773</td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.757</td>\n",
              "      <td>0.799</td>\n",
              "      <td>1.824</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Post hoc + monoBERT</td>\n",
              "      <td>B</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.141</td>\n",
              "      <td>0.125</td>\n",
              "      <td>0.684</td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.624</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.462</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Finetune monoBERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.776</td>\n",
              "      <td>0.422</td>\n",
              "      <td>1.093</td>\n",
              "      <td>0.221</td>\n",
              "      <td>-~\u2014\u00ab0.721 |</td>\n",
              "      <td>0.696</td>\n",
              "      <td>0.268</td>\n",
              "      <td>1.843</td>\n",
              "      <td>0.709</td>\n",
              "      <td>\u20181.874</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Finetune BERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.738</td>\n",
              "      <td>0.327</td>\n",
              "      <td>1.253</td>\n",
              "      <td>0.266</td>\n",
              "      <td>~=\u20140.785_ |</td>\n",
              "      <td>_ 0.727</td>\n",
              "      <td>0.285</td>\n",
              "      <td>1.756</td>\n",
              "      <td>0.546</td>\n",
              "      <td>\u00ab1.416</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>LLM prompting w/ rubrics</td>\n",
              "      <td>D</td>\n",
              "      <td>0.786</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.000'</td>\n",
              "      <td>1.246</td>\n",
              "      <td>\u00ab2.137</td>\n",
              "      <td>| 0.728</td>\n",
              "      <td>0.328</td>\n",
              "      <td>1.2947</td>\n",
              "      <td>1.194</td>\n",
              "      <td>2.773</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b5e12db8-81bb-4bb4-a332-17c90730a4ce')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-b5e12db8-81bb-4bb4-a332-17c90730a4ce button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-b5e12db8-81bb-4bb4-a332-17c90730a4ce');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-6973a6a0-303b-4a8b-ae1d-60ef3954f3ca\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-6973a6a0-303b-4a8b-ae1d-60ef3954f3ca')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-6973a6a0-303b-4a8b-ae1d-60ef3954f3ca button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                     Method Category TREC-ndcg TREC-ndcg@10 TREC-CB-ECE  \\\n",
              "0     Uncalibrated monoBERT        A     0.799        0.494       1.205   \n",
              "1       Post hoc + monoBERT        B     0.799        0.494       1.141   \n",
              "2         Finetune monoBERT        C     0.776        0.422       1.093   \n",
              "3             Finetune BERT        C     0.738        0.327       1.253   \n",
              "4  LLM prompting w/ rubrics        D     0.786        0.457      1.000'   \n",
              "\n",
              "  TREC-ECE     TREC-MSE NTCIR-ndcg NTCIR-ndcg@10 NTCIR-CB-ECE NTCIR-ECE  \\\n",
              "0    0.320        0.773    | 0.735         0.337        1.757     0.799   \n",
              "1    0.125        0.684    | 0.735         0.337        1.624     0.457   \n",
              "2    0.221  -~\u2014\u00ab0.721 |      0.696         0.268        1.843     0.709   \n",
              "3    0.266  ~=\u20140.785_ |    _ 0.727         0.285        1.756     0.546   \n",
              "4    1.246       \u00ab2.137    | 0.728         0.328       1.2947     1.194   \n",
              "\n",
              "  NTCIR-MSE  \n",
              "0     1.824  \n",
              "1     1.462  \n",
              "2    \u20181.874  \n",
              "3    \u00ab1.416  \n",
              "4     2.773  "
            ]
          },
          "execution_count": 48,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Table 2\n",
        "df2.head()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 49,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "D65mGZOMUuFx",
        "outputId": "7b319725-4332-4768-d897-4934e9aa19db"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Define function to iterate through your dataframes and concatenate each row's data to itself:\n",
        "\n",
        "def concat_row_values(dataframe: pd.DataFrame) -> list[str]:\n",
        "    \"\"\"\n",
        "    Concatenate all values per row in a dataframe, separated by \", \".\n",
        "\n",
        "    :param: Dataframe containing rows you want to concatenate.\n",
        "    :return: Concatenated row values.\n",
        "    \"\"\"\n",
        "    return dataframe.apply(lambda row: ', '.join(row.astype(str)), axis=1).tolist()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 50,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "0CWTNKLZUuL2",
        "outputId": "a1e2371c-5d1b-4a60-bef4-1b3f4c2d6e1b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "df1_concat_rows = concat_row_values(df1)\n",
        "df2_concat_rows = concat_row_values(df2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 51,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 160
        },
        "id": "5O4SYCFRUuTQ",
        "outputId": "5d4b7d65-d535-46b1-d6eb-bd57dc84713c"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "['Uncalibrated monoBERT, A, 0.799, 0.494, 1.205, 0.320, 0.773, | 0.735, 0.337, 1.757, 0.799, 1.824',\n",
              " 'Post hoc + monoBERT, B, 0.799, 0.494, 1.141, 0.125, 0.684, | 0.735, 0.337, 1.624, 0.457, 1.462',\n",
              " 'Finetune monoBERT, C, 0.776, 0.422, 1.093, 0.221, -~\u2014\u00ab0.721 |, 0.696, 0.268, 1.843, 0.709, \u20181.874',\n",
              " 'Finetune BERT, C, 0.738, 0.327, 1.253, 0.266, ~=\u20140.785_ |, _ 0.727, 0.285, 1.756, 0.546, \u00ab1.416',\n",
              " \"LLM prompting w/ rubrics, D, 0.786, 0.457, 1.000', 1.246, \u00ab2.137, | 0.728, 0.328, 1.2947, 1.194, 2.773\",\n",
              " 'Post hoc + MC Sampling LLM, E, 0.790, 0.473, 1.165, 0.145, 0.673, | 0.736, 0.364\", 1.677, 0.472, \u20181.540',\n",
              " 'Literal Explanation + BERT, F, 0.815*, 0.529%, 0.996\u00b0, 0.067*, 0.602\" |, 0.742, 0.340, 1.534\", 0.355, 1.3307',\n",
              " \"Conditional Explanation + BERT, F, 0.822, \u20140.534*, 0.862', 0.428, ~\u20140.832_ |, 0.720, 0.322, 1.405', 0.2577, 1.2907\"]"
            ]
          },
          "execution_count": 51,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview, nice!\n",
        "df2_concat_rows"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 52,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "2eLbJXwiHg0j",
        "outputId": "bc62f291-1ee8-4a70-d21c-c186caad8c82"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Now you need to write a function to turn items into aLlamaIndex Document objs so they can go in your indexing pipeline downstream:\n",
        "\n",
        "def turn_data_into_documents(rows: list[str]) -> list[Document]:\n",
        "    \"\"\"\n",
        "    Transform data into LlamaIndex Document objects.\n",
        "    Document obj: llama_index >> core >> schema.py\n",
        "\n",
        "    :param rows: Data you want to turn into Documents.\n",
        "    :return: Document objects.\n",
        "    \"\"\"\n",
        "    docs = []\n",
        "    for i in rows:\n",
        "        doc = Document(text=i)\n",
        "        docs.append(doc)\n",
        "    return docs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 53,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "d6Pc6qCIWoi0",
        "outputId": "c48cfb3f-5b16-43bb-d348-a037233afa69"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Turn each concatenated row into Document objs:\n",
        "df1_concat_rows_docs = turn_data_into_documents(df1_concat_rows)\n",
        "df2_concat_rows_docs = turn_data_into_documents(df2_concat_rows)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "GzNJ_372H0w8"
      },
      "source": [
        "### Run indexing pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 54,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "29b9d7ea1f6e4504addd5ad8d8094a68",
            "c534b899e5bf427fb80780fe74711a3d",
            "6b5f68eab8914004a0d7aed9df26fab4",
            "5236fbbb0985478288e8541e86c9a1c3",
            "4d28abd8c5cc43a49ce40a45e287c80f",
            "fc8c4b4878e946bca4eaf003d51f6ced",
            "409a62cb3d1a4731bb0f11085c06de0a",
            "f7b961f8a229446fabbcd7a36a7d268b",
            "51ada96bb02d4f87ba945ad78d7c6982",
            "1650f4bf872847d49da334889f330536",
            "d1abd91d47804e94add09540adc7f8a7"
          ]
        },
        "id": "8aZ0_uzlWouy",
        "outputId": "aa26f3e7-1ff6-443c-8886-e26ce1affa2b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "29b9d7ea1f6e4504addd5ad8d8094a68",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Upserted vectors:   0%|          | 0/63 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Define namespace for Variant 1\n",
        "v1_namespace = 'v1'\n",
        "\n",
        "# Initialize vector store w/v1 namespace\n",
        "v1_vector_store = initialize_vector_store(pinecone_index, v1_namespace)\n",
        "\n",
        "# Define docs you'll send through indexing pipeline into v1_namespace\n",
        "# You will combine the table contents you defined above w/the regular PDF contents from control\n",
        "v1_docs = df1_concat_rows_docs + df2_concat_rows_docs + ctrl_docs\n",
        "\n",
        "# Run pipeline\n",
        "output = run_indexing_pipeline(v1_vector_store, v1_docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ryJ_-HMOXVhK"
      },
      "outputs": [],
      "source": [
        "# Confirm your v1 docs made it to the index, in the correct namespace\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "T40RiV0PJK12"
      },
      "source": [
        "### Run RAG pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 104,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "r-wO7f2rJMYn",
        "outputId": "ee59e04a-aa48-4338-e67b-bccd98d28e36"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Run RAG pipeline for v1 use case\n",
        "one_v1, two_v1, three_v1, four_v1, five_v1, six_v1, seven_v1 = run_rag_pipeline(v1_vector_store, QUERIES)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 105,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 929
        },
        "id": "hSoPRIqGKRw3",
        "outputId": "23e0d559-c549-446e-c8df-d9370a3107e5"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"exp_results\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"ANSWER\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n          \"TREC: 0.529; NTCIR: 0.340.\",\n          \"Category F: training nle-based neural rankers on calibration data.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"control\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in Table 1.\",\n          \"0.529 and 0.534\",\n          \"Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. 
The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v1\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to refine the ranking scores of the neural ranker.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "exp_results"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-ed40f169-e4d9-47ca-a23b-b7e85f81d1fd\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ANSWER</th>\n",
              "      <th>control</th>\n",
              "      <th>v1</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>How does the average query length compare to the average document length in table 1?</th>\n",
              "      <td>The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)</td>\n",
              "      <td>The average query length is shorter compared to the average document length in Table 1.</td>\n",
              "      <td>The average query length is shorter compared to the average document length in table 1.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?</th>\n",
              "      <td>TREC: 0.529; NTCIR: 0.340.</td>\n",
              "      <td>0.529 and 0.534</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?</th>\n",
              "      <td>NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>How do i interpret table 2's calibration and ranking scores?</th>\n",
              "      <td>Lower is better for calibration, higher is better for ranking</td>\n",
              "      <td>Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the weights of \"Uncalibrated monoBERT\" tuned on?</th>\n",
              "      <td>MSMarco</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What category was used to build and train literal explanation + BERT? what does this category mean?</th>\n",
              "      <td>Category F: training nle-based neural rankers on calibration data.</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?</th>\n",
              "      <td>Yes</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ed40f169-e4d9-47ca-a23b-b7e85f81d1fd')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-ed40f169-e4d9-47ca-a23b-b7e85f81d1fd button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-ed40f169-e4d9-47ca-a23b-b7e85f81d1fd');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-2d9d4a40-3389-4ee4-be76-967ad94f4c54\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-2d9d4a40-3389-4ee4-be76-967ad94f4c54')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-2d9d4a40-3389-4ee4-be76-967ad94f4c54 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                                                                                                                                                                                                                                                                                              ANSWER  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                        The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                              TREC: 0.529; NTCIR: 0.340.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?                    NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                                                                                           Lower is better for calibration, higher is better for ranking   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                    MSMarco   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                               Category F: training nle-based neural rankers on calibration data.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                   Yes   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    control  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.529 and 0.534   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v1  \n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in table 1.  \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                             The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.  \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...  \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...  \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.  \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...  \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.  "
            ]
          },
          "execution_count": 105,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Add variant 1's responses to `exp_results` dataframe:\n",
        "\n",
        "v1_responses = [one_v1, two_v1, three_v1, four_v1, five_v1, six_v1, seven_v1]\n",
        "\n",
        "exp_results['v1'] = v1_responses\n",
        "\n",
        "exp_results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "zUdTj2IiXEwH"
      },
      "source": [
        "## Variant 2: concatenate row values with header data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 58,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "DmVEUZrMQWfa",
        "outputId": "5afa51cf-659f-4a5b-939f-8a90052f844b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "def concat_rows_with_headers(dataframe) -> list[str, str]:\n",
        "    \"\"\"\n",
        "    For each row, for each value, concatenate it with its column header.\n",
        "    \"\"\"\n",
        "    return dataframe.apply(lambda row: ', '.join(f\"{col}: {row[col]}\" for col in dataframe.columns), axis=1).tolist()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 59,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "33_x9zEAYyMx",
        "outputId": "3ee16e9c-92ff-4a82-f1e4-1e2821beb972"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "df1_rows_w_headers = concat_rows_with_headers(df1)\n",
        "df2_rows_w_headers = concat_rows_with_headers(df2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 60,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 124
        },
        "id": "vrb-zZ2aQZ1-",
        "outputId": "692be796-7ded-4f4d-d6ea-faed8b49134b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "['Metric: # Queries (Train/Val/Test), TREC-DL: | 97/53/67, NTCIR-14: 48/16/16',\n",
              " 'Metric: Avg. # docs per query, TREC-DL: 282.7, NTCIR-14: 345.3',\n",
              " 'Metric: Levels of relevance, TREC-DL: 4, NTCIR-14: 5',\n",
              " 'Metric: Label dist. (low to high), TREC-DL: 58/22/14/6, NTCIR-14: 48/23/17/8/3',\n",
              " 'Metric: Avg. query length, TREC-DL: 8.0, NTCIR-14: 22.0',\n",
              " 'Metric: Avg. doc. length, TREC-DL: 70.9, NTCIR-14: 493.2']"
            ]
          },
          "execution_count": 60,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "df1_rows_w_headers"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 61,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 338
        },
        "id": "WdTJ_Y8rOQRN",
        "outputId": "3b15a393-f540-40fc-cfe3-2c3d8ac2378c"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "['Method: Uncalibrated monoBERT, Category: A, TREC-ndcg: 0.799, TREC-ndcg@10: 0.494, TREC-CB-ECE: 1.205, TREC-ECE: 0.320, TREC-MSE: 0.773, NTCIR-ndcg: | 0.735, NTCIR-ndcg@10: 0.337, NTCIR-CB-ECE: 1.757, NTCIR-ECE: 0.799, NTCIR-MSE: 1.824',\n",
              " 'Method: Post hoc + monoBERT, Category: B, TREC-ndcg: 0.799, TREC-ndcg@10: 0.494, TREC-CB-ECE: 1.141, TREC-ECE: 0.125, TREC-MSE: 0.684, NTCIR-ndcg: | 0.735, NTCIR-ndcg@10: 0.337, NTCIR-CB-ECE: 1.624, NTCIR-ECE: 0.457, NTCIR-MSE: 1.462',\n",
              " 'Method: Finetune monoBERT, Category: C, TREC-ndcg: 0.776, TREC-ndcg@10: 0.422, TREC-CB-ECE: 1.093, TREC-ECE: 0.221, TREC-MSE: -~\u2014\u00ab0.721 |, NTCIR-ndcg: 0.696, NTCIR-ndcg@10: 0.268, NTCIR-CB-ECE: 1.843, NTCIR-ECE: 0.709, NTCIR-MSE: \u20181.874',\n",
              " 'Method: Finetune BERT, Category: C, TREC-ndcg: 0.738, TREC-ndcg@10: 0.327, TREC-CB-ECE: 1.253, TREC-ECE: 0.266, TREC-MSE: ~=\u20140.785_ |, NTCIR-ndcg: _ 0.727, NTCIR-ndcg@10: 0.285, NTCIR-CB-ECE: 1.756, NTCIR-ECE: 0.546, NTCIR-MSE: \u00ab1.416',\n",
              " \"Method: LLM prompting w/ rubrics, Category: D, TREC-ndcg: 0.786, TREC-ndcg@10: 0.457, TREC-CB-ECE: 1.000', TREC-ECE: 1.246, TREC-MSE: \u00ab2.137, NTCIR-ndcg: | 0.728, NTCIR-ndcg@10: 0.328, NTCIR-CB-ECE: 1.2947, NTCIR-ECE: 1.194, NTCIR-MSE: 2.773\",\n",
              " 'Method: Post hoc + MC Sampling LLM, Category: E, TREC-ndcg: 0.790, TREC-ndcg@10: 0.473, TREC-CB-ECE: 1.165, TREC-ECE: 0.145, TREC-MSE: 0.673, NTCIR-ndcg: | 0.736, NTCIR-ndcg@10: 0.364\", NTCIR-CB-ECE: 1.677, NTCIR-ECE: 0.472, NTCIR-MSE: \u20181.540',\n",
              " 'Method: Literal Explanation + BERT, Category: F, TREC-ndcg: 0.815*, TREC-ndcg@10: 0.529%, TREC-CB-ECE: 0.996\u00b0, TREC-ECE: 0.067*, TREC-MSE: 0.602\" |, NTCIR-ndcg: 0.742, NTCIR-ndcg@10: 0.340, NTCIR-CB-ECE: 1.534\", NTCIR-ECE: 0.355, NTCIR-MSE: 1.3307',\n",
              " \"Method: Conditional Explanation + BERT, Category: F, TREC-ndcg: 0.822, TREC-ndcg@10: \u20140.534*, TREC-CB-ECE: 0.862', TREC-ECE: 0.428, TREC-MSE: ~\u20140.832_ |, NTCIR-ndcg: 0.720, NTCIR-ndcg@10: 0.322, NTCIR-CB-ECE: 1.405', NTCIR-ECE: 0.2577, NTCIR-MSE: 1.2907\"]"
            ]
          },
          "execution_count": 61,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "df2_rows_w_headers"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 62,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "ctn2yV1LaxhQ",
        "outputId": "840f50ba-8bbd-4601-abf8-0fbf916969df"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Now turn your items into Document objects like before, so they can go into the indexing pipeline downstream\n",
        "df1_rows_w_headers_docs = turn_data_into_documents(df1_rows_w_headers)\n",
        "df2_rows_w_headers_docs = turn_data_into_documents(df2_rows_w_headers)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 63,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 88
        },
        "id": "Oxids7R9axmZ",
        "outputId": "0d7b2a6c-9178-4912-e9f0-f65d2ecf9118"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "Document(id_='cb245e9f-1204-498e-bafc-f0ef4f493202', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Metric: # Queries (Train/Val/Test), TREC-DL: | 97/53/67, NTCIR-14: 48/16/16', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n')"
            ]
          },
          "execution_count": 63,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview one of your Document objs\n",
        "df1_rows_w_headers_docs[0]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7L0wTy-pY5E9"
      },
      "source": [
        "### Run indexing pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 64,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "4269a5bbe88f4d33a13ecc578dc1f085",
            "f78c83caaa06480a880ceb1cf2185e45",
            "9c0db9cc9956419e804784dc29504a59",
            "f07db6245eb142e4818ccf8676fa1f33",
            "b4c4620618e44397b1949c4145a4db00",
            "f9bc5de70c7f4a0ea70dc2b821493005",
            "b9860cf4971b4658a94453049dbc4ee9",
            "83156f5298eb49509aaaac3c52977fc3",
            "14ee353104b345b3bb128094f985bfd6",
            "d56cb337108d4309afa9a020815bb079",
            "a5ab05f9bbd94ef8a69c27739d301a67"
          ]
        },
        "id": "RjPyIkjdv6Mr",
        "outputId": "3957f480-7b69-4fca-86f1-f855f69f2760"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4269a5bbe88f4d33a13ecc578dc1f085",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Upserted vectors:   0%|          | 0/63 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Declare namespace\n",
        "v2_namespace = 'v2'\n",
        "\n",
        "# Initialize vector store w/v2 namespace\n",
        "v2_vector_store = initialize_vector_store(pinecone_index, v2_namespace)\n",
        "\n",
        "# Set up your docs\n",
        "v2_docs = df1_rows_w_headers_docs + df2_rows_w_headers_docs + ctrl_docs\n",
        "\n",
        "# Run your pipeline\n",
        "output = run_indexing_pipeline(v2_vector_store, v2_docs)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 65,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 124
        },
        "id": "e_hSiz2hAFzl",
        "outputId": "53c12e89-a19e-43da-ce0a-94a7900c0f94"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "{'dimension': 1536,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'control': {'vector_count': 48},\n",
              "                'v1': {'vector_count': 63},\n",
              "                'v2': {'vector_count': 0}},\n",
              " 'total_vector_count': 111}"
            ]
          },
          "execution_count": 65,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Awesome, you have vectorized each row's values and column headers (per table), and\n",
        "# upserted them all into Pinecone along with the 'control' vectors.\n",
        "\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "h6lzrxbwZMX5"
      },
      "source": [
        "### Run RAG pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 106,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "fFyaM7UtZgLT",
        "outputId": "11174acc-d4f0-43ba-ba87-af14dce0453e"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Run RAG pipeline for v2 use case\n",
        "one_v2, two_v2, three_v2, four_v2, five_v2, six_v2, seven_v2 = run_rag_pipeline(v2_vector_store, QUERIES)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 107,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "6YLIVVj-Z2JT",
        "outputId": "6947c5bc-f92a-4163-eea9-10402bbff65f"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"exp_results\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"ANSWER\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n          \"TREC: 0.529; NTCIR: 0.340.\",\n          \"Category F: training nle-based neural rankers on calibration data.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"control\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in Table 1.\",\n          \"0.529 and 0.534\",\n          \"Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. 
The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v1\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to refine the ranking scores of the neural ranker.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v2\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is significantly lower than the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. 
For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs to enhance the ranking performance of neural rankers.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "exp_results"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-0d72535c-e62c-4144-a065-dad1c56cbdfa\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ANSWER</th>\n",
              "      <th>control</th>\n",
              "      <th>v1</th>\n",
              "      <th>v2</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>How does the average query length compare to the average document length in table 1?</th>\n",
              "      <td>The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)</td>\n",
              "      <td>The average query length is shorter compared to the average document length in Table 1.</td>\n",
              "      <td>The average query length is shorter compared to the average document length in table 1.</td>\n",
              "      <td>The average query length is significantly lower than the average document length in table 1.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?</th>\n",
              "      <td>TREC: 0.529; NTCIR: 0.340.</td>\n",
              "      <td>0.529 and 0.534</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.</td>\n",
              "      <td>The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?</th>\n",
              "      <td>NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>How do i interpret table 2's calibration and ranking scores?</th>\n",
              "      <td>Lower is better for calibration, higher is better for ranking</td>\n",
              "      <td>Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the weights of \"Uncalibrated monoBERT\" tuned on?</th>\n",
              "      <td>MSMarco</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What category was used to build and train literal explanation + BERT? what does this category mean?</th>\n",
              "      <td>Category F: training nle-based neural rankers on calibration data.</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?</th>\n",
              "      <td>Yes</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-0d72535c-e62c-4144-a065-dad1c56cbdfa')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-0d72535c-e62c-4144-a065-dad1c56cbdfa button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-0d72535c-e62c-4144-a065-dad1c56cbdfa');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-14f77d7e-061f-465d-bb18-00de69675271\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-14f77d7e-061f-465d-bb18-00de69675271')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-14f77d7e-061f-465d-bb18-00de69675271 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                                                                                                                                                                                                                                                                                              ANSWER  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                        The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                              TREC: 0.529; NTCIR: 0.340.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?                    NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                                                                                           Lower is better for calibration, higher is better for ranking   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                    MSMarco   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                               Category F: training nle-based neural rankers on calibration data.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                   Yes   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    control  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.529 and 0.534   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v1  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                             The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v2  \n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                           The average query length is significantly lower than the average document length in table 1.  \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                            The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.  \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...  \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...  \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.  \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...  \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.  "
            ]
          },
          "execution_count": 107,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Add variant 2's responses to `exp_results` dataframe:\n",
        "# NOTE(review): assumes the response order matches the row order of `exp_results`\n",
        "# (i.e. the order of QUERIES) — confirm.\n",
        "v2_responses = [one_v2, two_v2, three_v2, four_v2, five_v2, six_v2, seven_v2]\n",
        "\n",
        "exp_results['v2'] = v2_responses\n",
        "\n",
        "# Bare last expression renders the updated dataframe via rich display\n",
        "exp_results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ytxUQ7srMHDR"
      },
      "source": [
        "## Variant 3: Concatenate row values w/header data *and* table description data\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 68,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "51-nnjUWLL1r",
        "outputId": "21d9a090-dfc9-4e0e-8cdc-118f630b5154"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Make function to extract table descriptions from the PDF\n",
        "# Note: this function includes transforming the table descriptions into LlamaIndex Document objs, so no need to do that later\n",
        "\n",
        "def extract_table_description(docs: list[Document], start_phrase: str, end_phrase: str) -> list[Document]:\n",
        "    \"\"\"\n",
        "    Extract descriptions of embedded tables.\n",
        "\n",
        "    :param docs: LlamaIndex documents you want to search through to find table descriptions.\n",
        "    :param start_phrase: The starting boundary of your table description (inclusive).\n",
        "    :param end_phrase: The ending boundary of your table description (inclusive).\n",
        "    :returns: The matched descriptions, wrapped as LlamaIndex Document objects.\n",
        "    \"\"\"\n",
        "    # NOTE(review): start_phrase/end_phrase are interpolated unescaped, so this assumes\n",
        "    # they contain no regex metacharacters — confirm for any new boundary phrases.\n",
        "    pattern = fr\"(.|^)({start_phrase}.*?{end_phrase})\"\n",
        "    table_desc = []\n",
        "    for d in docs:\n",
        "        # DOTALL lets '.' span line breaks, since PDF captions wrap across lines\n",
        "        match = re.search(pattern, d.text, re.DOTALL)\n",
        "        if match:\n",
        "            # group(2) is the caption itself, without the single preceding character\n",
        "            table_desc.append(match.group(2))\n",
        "    return turn_data_into_documents(table_desc)  # Turn tables into LlamaIndex Document objs so they work in Pipeline\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 69,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "lD0TuHxUOdWE",
        "outputId": "bff89f3c-7142-40be-b8ce-bbc42fb1bdfa"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Extract descriptions for Table 1 and Table 2, from `control_docs`\n",
        "# NOTE(review): boundary phrases assume each caption begins with 'Table N' and that the\n",
        "# end phrase occurs once within the caption — confirm if the source PDF changes.\n",
        "t1_desc = extract_table_description(ctrl_docs, \"Table 1\", \"512 tokens\")\n",
        "t2_desc = extract_table_description(ctrl_docs, \"Table 2\", \"marked with\")  # Don't worry about special char in actual PDF at end of desc.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 70,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        },
        "id": "pIuKX4i1OfGS",
        "outputId": "b265a040-3a3e-42ef-e715-e4880940edfc"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Table 1: Statistics of the TREC-DL 2019-2022 and NTCIR-14\\nWWW-2 Datasets. The lengths of queries and documents are\\nquantified using BERT tokenization. For the NTCIR dataset,\\ndocuments sourced from ClueWeb have undergone prepro-\\ncessing to retain only the initial 512 tokens'"
            ]
          },
          "execution_count": 70,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "t1_desc[0].text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 71,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "6b7OsDffSS9n",
        "outputId": "63d9fe85-7b96-4c57-8921-f176be518964"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Make deep copies of the original rows-with-headers Document lists (both tables), so you\n",
        "# don't mess up past data structures, as we update the copies in-place below.\n",
        "copy_df1_rows_w_headers_docs = deepcopy(df1_rows_w_headers_docs)\n",
        "copy_df2_rows_w_headers_docs = deepcopy(df2_rows_w_headers_docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 72,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "UtIlmymFyU3G",
        "outputId": "c47194ac-d1e4-40f4-ac46-5d67ab9d9cab"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Metric: # Queries (Train/Val/Test), TREC-DL: | 97/53/67, NTCIR-14: 48/16/16'"
            ]
          },
          "execution_count": 72,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Before in-place update, t1: first Table 1 row Document (no description appended yet)\n",
        "copy_df1_rows_w_headers_docs[0].text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 73,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 53
        },
        "id": "kDnzFW6dyctH",
        "outputId": "32d8da82-3b1c-4f38-a867-b8f918621d51"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Method: Uncalibrated monoBERT, Category: A, TREC-ndcg: 0.799, TREC-ndcg@10: 0.494, TREC-CB-ECE: 1.205, TREC-ECE: 0.320, TREC-MSE: 0.773, NTCIR-ndcg: | 0.735, NTCIR-ndcg@10: 0.337, NTCIR-CB-ECE: 1.757, NTCIR-ECE: 0.799, NTCIR-MSE: 1.824'"
            ]
          },
          "execution_count": 73,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Before in-place update, t2: first Table 2 row Document (no description appended yet)\n",
        "copy_df2_rows_w_headers_docs[0].text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 74,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "gMxLnI-_Qas2",
        "outputId": "9880a0e7-20a6-4639-c4d8-fb6a80cfb761"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Update .text attributes of each Document obj in-place to include the extracted table descriptions:\n",
        "\n",
        "def add_in_table_description(docs: list[Document], desc: list[Document]) -> None:\n",
        "    \"\"\"\n",
        "    Append an embedded table's description to each Document's text (in-place).\n",
        "\n",
        "    Only the first item of `desc` is used, since each table has a single description.\n",
        "\n",
        "    :param docs: Documents whose .text attribute you want to update with a Table description.\n",
        "    :param desc: Single-item list holding the Table description you want to add to each Document's .text attribute.\n",
        "    \"\"\"\n",
        "    if not desc:\n",
        "        return  # No description extracted; leave docs unchanged instead of raising IndexError\n",
        "    description = desc[0].text  # Hoisted: same description is appended to every doc\n",
        "    for doc in docs:\n",
        "        doc.text += f\". {description}\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 75,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "EhOWNBIBTY5a",
        "outputId": "a80d5c5d-d19c-4b2c-fe46-5c598737ff23"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Add Table 1 desc to Table 1 concatenated rows and headers (mutates the copies in-place)\n",
        "add_in_table_description(copy_df1_rows_w_headers_docs, t1_desc)\n",
        "\n",
        "# Add Table 2 desc to Table 2 concatenated rows and headers (mutates the copies in-place)\n",
        "add_in_table_description(copy_df2_rows_w_headers_docs, t2_desc)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 76,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 71
        },
        "id": "c36p4QIQyiv3",
        "outputId": "6e15b2e7-f926-4777-879e-62b6aaa2807a"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Metric: # Queries (Train/Val/Test), TREC-DL: | 97/53/67, NTCIR-14: 48/16/16. Table 1: Statistics of the TREC-DL 2019-2022 and NTCIR-14\\nWWW-2 Datasets. The lengths of queries and documents are\\nquantified using BERT tokenization. For the NTCIR dataset,\\ndocuments sourced from ClueWeb have undergone prepro-\\ncessing to retain only the initial 512 tokens'"
            ]
          },
          "execution_count": 76,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# After in-place update, t1: the Table 1 description now trails the row text\n",
        "copy_df1_rows_w_headers_docs[0].text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 77,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 107
        },
        "id": "KzmZWfDCyk4U",
        "outputId": "ca7ec2a4-1f10-4a2e-ea09-d7f8d52f43e7"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Method: Uncalibrated monoBERT, Category: A, TREC-ndcg: 0.799, TREC-ndcg@10: 0.494, TREC-CB-ECE: 1.205, TREC-ECE: 0.320, TREC-MSE: 0.773, NTCIR-ndcg: | 0.735, NTCIR-ndcg@10: 0.337, NTCIR-CB-ECE: 1.757, NTCIR-ECE: 0.799, NTCIR-MSE: 1.824. Table 2: Ranking and scale calibration performance of baseline methods and our approaches on two scale calibration datasets\\nTREC and NTCIR. Note that lower is better with calibration metrics (CB-ECE, ECE and MSE). Statistically significant improve-\\nments over \u201cPlatt Scaling monoBERT\u201d are marked with'"
            ]
          },
          "execution_count": 77,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# After in-place update, t2 (cell previews the Table 2 docs, not t1)\n",
        "copy_df2_rows_w_headers_docs[0].text"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 78,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "mteItXBAi8Ce",
        "outputId": "5b3834fe-779f-42ea-a79c-6f8345e1036d"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Rename your vars so they reflect the addition of the table descriptions\n",
        "# NOTE: these are aliases to the same lists (no copy), so both names see any further mutation\n",
        "df1_rows_w_headers_desc = copy_df1_rows_w_headers_docs\n",
        "df2_rows_w_headers_desc = copy_df2_rows_w_headers_docs"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 116,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 107
        },
        "id": "_iibv2NES3Da",
        "outputId": "a8661d9b-c0e3-422e-ebb6-93c9dcab6f02"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Method: Literal Explanation + BERT, Category: F, TREC-ndcg: 0.815*, TREC-ndcg@10: 0.529%, TREC-CB-ECE: 0.996\u00b0, TREC-ECE: 0.067*, TREC-MSE: 0.602\" |, NTCIR-ndcg: 0.742, NTCIR-ndcg@10: 0.340, NTCIR-CB-ECE: 1.534\", NTCIR-ECE: 0.355, NTCIR-MSE: 1.3307. Table 2: Ranking and scale calibration performance of baseline methods and our approaches on two scale calibration datasets\\nTREC and NTCIR. Note that lower is better with calibration metrics (CB-ECE, ECE and MSE). Statistically significant improve-\\nments over \u201cPlatt Scaling monoBERT\u201d are marked with'"
            ]
          },
          "execution_count": 116,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview of one doc from after you added the Table 2 description (comment previously said Table 1)\n",
        "df2_rows_w_headers_desc[-2].text"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dAfg71JLbHeP"
      },
      "source": [
        "### Run indexing pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 80,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "4453e4001c654052884d767c0ead373b",
            "eaba9b0d0e36434e976c07c2db3fd9e8",
            "a0a3dcafd69c4642a41c87b53385406a",
            "7d9c5f17619e43f79ba5fdc37e56b10b",
            "bd58da26d2704c1ea5a599efa830dd44",
            "9f484b25cc4845e9a625cdaadaa6f663",
            "c7e33648acc042aa8df078e331bb84a0",
            "da8dd880f72e4c68880e16786ba51a3d",
            "7d564ab186c04d1ab27152054d88efe4",
            "6aaaa43a2dc2457cb3c1c3045c52df50",
            "117fd87db5e24c378f9d8d08111f58e9"
          ]
        },
        "id": "FeNnPiiEbJkR",
        "outputId": "a4132dd4-0593-4642-8ee7-97d6b349bed8"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4453e4001c654052884d767c0ead373b",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Upserted vectors:   0%|          | 0/76 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Declare namespace\n",
        "v3_namespace = 'v3'\n",
        "\n",
        "# Initialize vector store w/v3 namespace\n",
        "v3_vector_store = initialize_vector_store(pinecone_index, v3_namespace)\n",
        "\n",
        "# Join docs: row Documents (with headers + table descriptions) for both tables, plus the control docs\n",
        "v3_docs = df1_rows_w_headers_desc + df2_rows_w_headers_desc + ctrl_docs\n",
        "\n",
        "# Run through embedding and indexing pipeline\n",
        "output = run_indexing_pipeline(v3_vector_store, v3_docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 81,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 142
        },
        "id": "8_tMoiLofe4S",
        "outputId": "70ae2ddd-fbd0-42d4-943b-a7ad2506f887"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "{'dimension': 1536,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'control': {'vector_count': 48},\n",
              "                'v1': {'vector_count': 63},\n",
              "                'v2': {'vector_count': 63},\n",
              "                'v3': {'vector_count': 0}},\n",
              " 'total_vector_count': 174}"
            ]
          },
          "execution_count": 81,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Sanity-check per-namespace vector counts in the index\n",
        "# NOTE(review): stats appear to lag recent upserts — 'v3' shows 0 here even though the\n",
        "# upsert above reported 76 vectors; re-run after a short wait to confirm.\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q861D3Xt24eP"
      },
      "source": [
        "### Run RAG pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 114,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "jEY062q71w_K",
        "outputId": "5dcb727d-6b62-4918-d282-04241429255c"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Run RAG pipeline for v3 use case\n",
        "# NOTE(review): unpacking assumes run_rag_pipeline returns one response per query, in the\n",
        "# same order as QUERIES (7 of them) — confirm against its definition.\n",
        "one_v3, two_v3, three_v3, four_v3, five_v3, six_v3, seven_v3 = run_rag_pipeline(v3_vector_store, QUERIES)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 115,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "Bxxgj8Xx1xHp",
        "outputId": "1703c8ba-9226-4614-beaa-5c676f5d1855"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"exp_results\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"ANSWER\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n          \"TREC: 0.529; NTCIR: 0.340.\",\n          \"Category F: training nle-based neural rankers on calibration data.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"control\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in Table 1.\",\n          \"0.529 and 0.534\",\n          \"Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. 
The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v1\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to refine the ranking scores of the neural ranker.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v2\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is significantly lower than the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. 
For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs to enhance the ranking performance of neural rankers.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v3\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter than the average document length in Table 1.\",\n          \"The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.\",\n          \"Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this approach, a BERT model is finetuned to process meta natural language explanations (NLEs) and generate scale-calibrated ranking scores.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v4\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is significantly lower than the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.\",\n          \"Category F was used to build and train the \\\"Literal Explanation + BERT\\\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. 
The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performance of neural rankers.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "exp_results"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-4f9e2984-4bc9-4504-9d4a-7ab6719e6f4d\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ANSWER</th>\n",
              "      <th>control</th>\n",
              "      <th>v1</th>\n",
              "      <th>v2</th>\n",
              "      <th>v3</th>\n",
              "      <th>v4</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>How does the average query length compare to the average document length in table 1?</th>\n",
              "      <td>The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)</td>\n",
              "      <td>The average query length is shorter compared to the average document length in Table 1.</td>\n",
              "      <td>The average query length is shorter compared to the average document length in table 1.</td>\n",
              "      <td>The average query length is significantly lower than the average document length in table 1.</td>\n",
              "      <td>The average query length is shorter than the average document length in Table 1.</td>\n",
              "      <td>The average query length is significantly lower than the average document length in table 1.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?</th>\n",
              "      <td>TREC: 0.529; NTCIR: 0.340.</td>\n",
              "      <td>0.529 and 0.534</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.</td>\n",
              "      <td>The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.</td>\n",
              "      <td>The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?</th>\n",
              "      <td>NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs c...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs h...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>How do i interpret table 2's calibration and ranking scores?</th>\n",
              "      <td>Lower is better for calibration, higher is better for ranking</td>\n",
              "      <td>Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...</td>\n",
              "      <td>The calibration scores in Table 2, represented by CB-ECE, ECE, and MSE, are metrics used to evaluate the alignment of the ranking scores produced by different methods with the target scale. Lower values for these metrics indicate better calibration performance. On the other hand, the ranking scores, such as nDCG and nDCG@10, measure the effectiveness of the ranking produced by the methods, whe...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the weights of \"Uncalibrated monoBERT\" tuned on?</th>\n",
              "      <td>MSMarco</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What category was used to build and train literal explanation + BERT? what does this category mean?</th>\n",
              "      <td>Category F: training nle-based neural rankers on calibration data.</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this approach, a BERT model is finetuned to process meta natural language explanations (NLEs) and generate scale-calibrated ranking scores.</td>\n",
              "      <td>Category F was used to build and train the \"Literal Explanation + BERT\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performan...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?</th>\n",
              "      <td>Yes</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.</td>\n",
              "      <td>The 'trec-dl' in Table 1 is not the same as the 'trec' in Table 2.</td>\n",
              "      <td>No, the 'trec-dl' in table 1 is not the same as the 'trec' in table 2.</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4f9e2984-4bc9-4504-9d4a-7ab6719e6f4d')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-4f9e2984-4bc9-4504-9d4a-7ab6719e6f4d button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-4f9e2984-4bc9-4504-9d4a-7ab6719e6f4d');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-9bf2943b-c010-40e0-af16-108b789bdba2\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-9bf2943b-c010-40e0-af16-108b789bdba2')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-9bf2943b-c010-40e0-af16-108b789bdba2 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                                                                                                                                                                                                                                                                                              ANSWER  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                        The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                              TREC: 0.529; NTCIR: 0.340.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?                    NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                                                                                           Lower is better for calibration, higher is better for ranking   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                    MSMarco   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                               Category F: training nle-based neural rankers on calibration data.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                   Yes   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    control  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.529 and 0.534   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v1  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                             The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v2  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                           The average query length is significantly lower than the average document length in table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                            The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v3  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                       The average query length is shorter than the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                   The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs c...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                The calibration scores in Table 2, represented by CB-ECE, ECE, and MSE, are metrics used to evaluate the alignment of the ranking scores produced by different methods with the target scale. Lower values for these metrics indicate better calibration performance. On the other hand, the ranking scores, such as nDCG and nDCG@10, measure the effectiveness of the ranking produced by the methods, whe...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                                                               Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this approach, a BERT model is finetuned to process meta natural language explanations (NLEs) and generate scale-calibrated ranking scores.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                           The 'trec-dl' in Table 1 is not the same as the 'trec' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v4  \n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                           The average query length is significantly lower than the average document length in table 1.  \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                         The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.  \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs h...  \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...  \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.  \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the \"Literal Explanation + BERT\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performan...  \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                       No, the 'trec-dl' in table 1 is not the same as the 'trec' in table 2.  "
            ]
          },
          "execution_count": 115,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Add variant 3's responses to `exp_results` dataframe:\n",
        "v3_responses = [one_v3, two_v3, three_v3, four_v3, five_v3, six_v3, seven_v3]\n",
        "\n",
        "exp_results['v3'] = v3_responses\n",
        "\n",
        "exp_results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "HyqF3J2DobEY"
      },
      "source": [
        "## Variant 4: Natural language injection\n",
        "\n",
        "For this variant, you will inject the structured data values from your table into a phrase (or sentence) and vectorize that."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 84,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 238
        },
        "id": "qf5-LXj-YBYE",
        "outputId": "b9008321-06f4-45c2-cee9-980190ba20b7"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"df1\",\n  \"rows\": 6,\n  \"fields\": [\n    {\n      \"column\": \"Metric\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"# Queries (Train/Val/Test)\",\n          \"Avg. # docs per query\",\n          \"Avg. doc. length\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-DL\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"| 97/53/67\",\n          \"282.7\",\n          \"70.9\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-14\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"48/16/16\",\n          \"345.3\",\n          \"493.2\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "df1"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-2f6536f5-a952-4d21-bb5b-549e2f291622\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Metric</th>\n",
              "      <th>TREC-DL</th>\n",
              "      <th>NTCIR-14</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td># Queries (Train/Val/Test)</td>\n",
              "      <td>| 97/53/67</td>\n",
              "      <td>48/16/16</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Avg. # docs per query</td>\n",
              "      <td>282.7</td>\n",
              "      <td>345.3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Levels of relevance</td>\n",
              "      <td>4</td>\n",
              "      <td>5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Label dist. (low to high)</td>\n",
              "      <td>58/22/14/6</td>\n",
              "      <td>48/23/17/8/3</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>Avg. query length</td>\n",
              "      <td>8.0</td>\n",
              "      <td>22.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Avg. doc. length</td>\n",
              "      <td>70.9</td>\n",
              "      <td>493.2</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-2f6536f5-a952-4d21-bb5b-549e2f291622')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-2f6536f5-a952-4d21-bb5b-549e2f291622 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-2f6536f5-a952-4d21-bb5b-549e2f291622');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-3c4e6d8d-d95d-4854-8552-07b5a58202f0\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-3c4e6d8d-d95d-4854-8552-07b5a58202f0')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-3c4e6d8d-d95d-4854-8552-07b5a58202f0 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                       Metric     TREC-DL      NTCIR-14\n",
              "0  # Queries (Train/Val/Test)  | 97/53/67      48/16/16\n",
              "1       Avg. # docs per query       282.7         345.3\n",
              "2         Levels of relevance           4             5\n",
              "3   Label dist. (low to high)  58/22/14/6  48/23/17/8/3\n",
              "4           Avg. query length         8.0          22.0\n",
              "5            Avg. doc. length        70.9         493.2"
            ]
          },
          "execution_count": 84,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Reminder of what df1 looks like\n",
        "df1"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 85,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "IkXJ568gYBkT",
        "outputId": "dda22de5-8ea2-49be-fb12-1a54c6af8bf2"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Write func that comes up with a natural language phrase/sentence/paragraph that makes Table 1's data make sense.\n",
        "# You could do also this via LLM if scaling this workflow is important.\n",
        "\n",
        "def inject_df1_vals_into_template(dataframe: pd.DataFrame) -> list[str]:\n",
        "    \"\"\"\n",
        "    Inject values from Table 1 dataframe (`df1`) into a natural-language template.\n",
        "\n",
        "    :param dataframe: Dataframe representing Table 1 from PDF.\n",
        "    :return: Populated natural-language templates.\n",
        "    \"\"\"\n",
        "    filled_templates = []\n",
        "    for i,v in dataframe.iterrows():\n",
        "        metric_name = v[0]\n",
        "        val_1 = v[1]\n",
        "        val_2 = v[2]\n",
        "        dataset_1 = dataframe.columns[1]\n",
        "        dataset_2 = dataframe.columns[2]\n",
        "        template = f\"The {metric_name} in the {dataset_1} dataset is {val_1}, while it\\'s {val_2} in the {dataset_2} dataset\"\n",
        "        filled_templates.append(template)\n",
        "    return filled_templates\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 86,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 231
        },
        "id": "oIySv4xRYBqR",
        "outputId": "941fe256-bcac-45ec-c29d-3dc1ea9b7634"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "<ipython-input-85-ae18fa5bcf96>:13: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  metric_name = v[0]\n",
            "<ipython-input-85-ae18fa5bcf96>:14: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  val_1 = v[1]\n",
            "<ipython-input-85-ae18fa5bcf96>:15: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
            "  val_2 = v[2]\n"
          ]
        }
      ],
      "source": [
        "df1_filled_templates = inject_df1_vals_into_template(df1)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 87,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 35
        },
        "id": "RvZis-0K4MNh",
        "outputId": "3ac85122-8f67-4d5f-eb1f-e9a36f1c044b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "\"The # Queries (Train/Val/Test) in the TREC-DL dataset is | 97/53/67, while it's 48/16/16 in the NTCIR-14 dataset\""
            ]
          },
          "execution_count": 87,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Check out one of the phrases you made:\n",
        "df1_filled_templates[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 88,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 474
        },
        "id": "XpH4y7NFwDc6",
        "outputId": "e3c9a8dd-aa6d-46e5-fe12-4b558d085f9e"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"df2\",\n  \"rows\": 8,\n  \"fields\": [\n    {\n      \"column\": \"Method\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"Post hoc + monoBERT\",\n          \"Post hoc + MC Sampling LLM\",\n          \"Uncalibrated monoBERT\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"Category\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 6,\n        \"samples\": [\n          \"A\",\n          \"B\",\n          \"F\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ndcg\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.799\",\n          \"0.776\",\n          \"0.815*\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ndcg@10\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.494\",\n          \"0.422\",\n          \"0.529%\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-CB-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.141\",\n          \"1.165\",\n          \"1.205\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"TREC-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.125\",\n          \"0.145\",\n          \"0.320\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": 
\"\"\n      }\n    },\n    {\n      \"column\": \"TREC-MSE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.684\",\n          \"0.673\",\n          \"0.773\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ndcg\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"| 0.735\",\n          \"0.696\",\n          \"0.742\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ndcg@10\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"0.337\",\n          \"0.268\",\n          \"0.340\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-CB-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.624\",\n          \"1.677\",\n          \"1.757\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-ECE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"0.457\",\n          \"0.472\",\n          \"0.799\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"NTCIR-MSE\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 8,\n        \"samples\": [\n          \"1.462\",\n          \"\\u20181.540\",\n          \"1.824\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "df2"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-68ee2418-b63a-4419-8f9a-65cee86538ba\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Method</th>\n",
              "      <th>Category</th>\n",
              "      <th>TREC-ndcg</th>\n",
              "      <th>TREC-ndcg@10</th>\n",
              "      <th>TREC-CB-ECE</th>\n",
              "      <th>TREC-ECE</th>\n",
              "      <th>TREC-MSE</th>\n",
              "      <th>NTCIR-ndcg</th>\n",
              "      <th>NTCIR-ndcg@10</th>\n",
              "      <th>NTCIR-CB-ECE</th>\n",
              "      <th>NTCIR-ECE</th>\n",
              "      <th>NTCIR-MSE</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Uncalibrated monoBERT</td>\n",
              "      <td>A</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.205</td>\n",
              "      <td>0.320</td>\n",
              "      <td>0.773</td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.757</td>\n",
              "      <td>0.799</td>\n",
              "      <td>1.824</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>Post hoc + monoBERT</td>\n",
              "      <td>B</td>\n",
              "      <td>0.799</td>\n",
              "      <td>0.494</td>\n",
              "      <td>1.141</td>\n",
              "      <td>0.125</td>\n",
              "      <td>0.684</td>\n",
              "      <td>| 0.735</td>\n",
              "      <td>0.337</td>\n",
              "      <td>1.624</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.462</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>Finetune monoBERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.776</td>\n",
              "      <td>0.422</td>\n",
              "      <td>1.093</td>\n",
              "      <td>0.221</td>\n",
              "      <td>-~\u2014\u00ab0.721 |</td>\n",
              "      <td>0.696</td>\n",
              "      <td>0.268</td>\n",
              "      <td>1.843</td>\n",
              "      <td>0.709</td>\n",
              "      <td>\u20181.874</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Finetune BERT</td>\n",
              "      <td>C</td>\n",
              "      <td>0.738</td>\n",
              "      <td>0.327</td>\n",
              "      <td>1.253</td>\n",
              "      <td>0.266</td>\n",
              "      <td>~=\u20140.785_ |</td>\n",
              "      <td>_ 0.727</td>\n",
              "      <td>0.285</td>\n",
              "      <td>1.756</td>\n",
              "      <td>0.546</td>\n",
              "      <td>\u00ab1.416</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>LLM prompting w/ rubrics</td>\n",
              "      <td>D</td>\n",
              "      <td>0.786</td>\n",
              "      <td>0.457</td>\n",
              "      <td>1.000'</td>\n",
              "      <td>1.246</td>\n",
              "      <td>\u00ab2.137</td>\n",
              "      <td>| 0.728</td>\n",
              "      <td>0.328</td>\n",
              "      <td>1.2947</td>\n",
              "      <td>1.194</td>\n",
              "      <td>2.773</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>Post hoc + MC Sampling LLM</td>\n",
              "      <td>E</td>\n",
              "      <td>0.790</td>\n",
              "      <td>0.473</td>\n",
              "      <td>1.165</td>\n",
              "      <td>0.145</td>\n",
              "      <td>0.673</td>\n",
              "      <td>| 0.736</td>\n",
              "      <td>0.364\"</td>\n",
              "      <td>1.677</td>\n",
              "      <td>0.472</td>\n",
              "      <td>\u20181.540</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>Literal Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.815*</td>\n",
              "      <td>0.529%</td>\n",
              "      <td>0.996\u00b0</td>\n",
              "      <td>0.067*</td>\n",
              "      <td>0.602\" |</td>\n",
              "      <td>0.742</td>\n",
              "      <td>0.340</td>\n",
              "      <td>1.534\"</td>\n",
              "      <td>0.355</td>\n",
              "      <td>1.3307</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>Conditional Explanation + BERT</td>\n",
              "      <td>F</td>\n",
              "      <td>0.822</td>\n",
              "      <td>\u20140.534*</td>\n",
              "      <td>0.862'</td>\n",
              "      <td>0.428</td>\n",
              "      <td>~\u20140.832_ |</td>\n",
              "      <td>0.720</td>\n",
              "      <td>0.322</td>\n",
              "      <td>1.405'</td>\n",
              "      <td>0.2577</td>\n",
              "      <td>1.2907</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-68ee2418-b63a-4419-8f9a-65cee86538ba')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-68ee2418-b63a-4419-8f9a-65cee86538ba button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-68ee2418-b63a-4419-8f9a-65cee86538ba');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-00fa6fb1-968b-4f7d-8c96-c36fd6240fcc\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-00fa6fb1-968b-4f7d-8c96-c36fd6240fcc')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--button-hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-00fa6fb1-968b-4f7d-8c96-c36fd6240fcc button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                           Method Category TREC-ndcg TREC-ndcg@10 TREC-CB-ECE  \\\n",
              "0           Uncalibrated monoBERT        A     0.799        0.494       1.205   \n",
              "1             Post hoc + monoBERT        B     0.799        0.494       1.141   \n",
              "2               Finetune monoBERT        C     0.776        0.422       1.093   \n",
              "3                   Finetune BERT        C     0.738        0.327       1.253   \n",
              "4        LLM prompting w/ rubrics        D     0.786        0.457      1.000'   \n",
              "5      Post hoc + MC Sampling LLM        E     0.790        0.473       1.165   \n",
              "6      Literal Explanation + BERT        F    0.815*       0.529%      0.996\u00b0   \n",
              "7  Conditional Explanation + BERT        F     0.822      \u20140.534*      0.862'   \n",
              "\n",
              "  TREC-ECE     TREC-MSE NTCIR-ndcg NTCIR-ndcg@10 NTCIR-CB-ECE NTCIR-ECE  \\\n",
              "0    0.320        0.773    | 0.735         0.337        1.757     0.799   \n",
              "1    0.125        0.684    | 0.735         0.337        1.624     0.457   \n",
              "2    0.221  -~\u2014\u00ab0.721 |      0.696         0.268        1.843     0.709   \n",
              "3    0.266  ~=\u20140.785_ |    _ 0.727         0.285        1.756     0.546   \n",
              "4    1.246       \u00ab2.137    | 0.728         0.328       1.2947     1.194   \n",
              "5    0.145        0.673    | 0.736        0.364\"        1.677     0.472   \n",
              "6   0.067*     0.602\" |      0.742         0.340       1.534\"     0.355   \n",
              "7    0.428   ~\u20140.832_ |      0.720         0.322       1.405'    0.2577   \n",
              "\n",
              "  NTCIR-MSE  \n",
              "0     1.824  \n",
              "1     1.462  \n",
              "2    \u20181.874  \n",
              "3    \u00ab1.416  \n",
              "4     2.773  \n",
              "5    \u20181.540  \n",
              "6    1.3307  \n",
              "7    1.2907  "
            ]
          },
          "execution_count": 88,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Do the same with df2\n",
        "# Reminder of what df2 looks like (Table 2 from the PDF; note the OCR artifacts)\n",
        "df2"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 89,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "GwbYYsJyb9GE",
        "outputId": "3c425b1d-9d81-49f8-f5bf-644b4a740bdc"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "def inject_df2_vals_into_template(dataframe: pd.DataFrame) -> list[str]:\n",
        "    \"\"\"\n",
        "    Inject values from Table 2 dataframe (`df2`) into a natural-language template.\n",
        "\n",
        "    Each row produces two sentences (one for the TREC dataset columns, one\n",
        "    for the NTCIR dataset columns), joined into a single paragraph.\n",
        "\n",
        "    :param dataframe: Dataframe representing Table 2 from PDF. Expected to\n",
        "        have 'Method' and 'Category' columns plus metric columns prefixed\n",
        "        with 'TREC-' and 'NTCIR-' (e.g. 'TREC-ndcg@10', 'NTCIR-CB-ECE').\n",
        "    :return: Populated natural-language templates, one combined\n",
        "        TREC + NTCIR paragraph per dataframe row.\n",
        "    \"\"\"\n",
        "    template = (\"Against the {dataset} dataset, the \\\"{method}\\\" method (from the \\\"{category}\\\" category) got a nDCG score of {nDCG}, \"\n",
        "                \"a nDCG@10 score of {nDCG10}, a CB-ECE score of {CB_ECE}, \"\n",
        "                \"an ECE score of {ECE}, and an MSE score of {MSE}.\")\n",
        "\n",
        "    paragraphs = []  # To store the final paragraphs\n",
        "\n",
        "    # Iterate over the dataframe\n",
        "    for index, row in dataframe.iterrows():\n",
        "        method = row['Method']\n",
        "        category = row['Category']\n",
        "        # split('-', 1) keeps everything after the FIRST hyphen, so a column\n",
        "        # like 'TREC-CB-ECE' maps to the metric key 'CB-ECE' (not just 'CB').\n",
        "        trec_values = {col.split('-', 1)[1]: val for col, val in row.items() if col.startswith('TREC')}\n",
        "        ntcir_values = {col.split('-', 1)[1]: val for col, val in row.items() if col.startswith('NTCIR')}\n",
        "\n",
        "        # Inject the values into the template for TREC\n",
        "        trec_paragraph = template.format(\n",
        "            dataset=\"TREC\",\n",
        "            method=method,\n",
        "            category=category,\n",
        "            nDCG=trec_values.get('ndcg', 'N/A'),\n",
        "            nDCG10=trec_values.get('ndcg@10', 'N/A'),\n",
        "            CB_ECE=trec_values.get('CB-ECE', 'N/A'),\n",
        "            ECE=trec_values.get('ECE', 'N/A'),\n",
        "            MSE=trec_values.get('MSE', 'N/A')\n",
        "        )\n",
        "\n",
        "        # Inject the values into the template for NTCIR\n",
        "        ntcir_paragraph = template.format(\n",
        "            dataset=\"NTCIR\",\n",
        "            method=method,\n",
        "            category=category,\n",
        "            nDCG=ntcir_values.get('ndcg', 'N/A'),\n",
        "            nDCG10=ntcir_values.get('ndcg@10', 'N/A'),\n",
        "            CB_ECE=ntcir_values.get('CB-ECE', 'N/A'),\n",
        "            ECE=ntcir_values.get('ECE', 'N/A'),\n",
        "            MSE=ntcir_values.get('MSE', 'N/A')\n",
        "        )\n",
        "\n",
        "        # Combine the TREC and NTCIR paragraphs\n",
        "        combined_paragraph = trec_paragraph + \" \" + ntcir_paragraph\n",
        "        paragraphs.append(combined_paragraph)\n",
        "\n",
        "    return paragraphs\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 90,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "5sOg_k2Vfu6M",
        "outputId": "90bf1ec3-3e76-4b9f-be08-59571c00bc77"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Populate the natural-language template with every row of df2\n",
        "# (one combined TREC + NTCIR paragraph per row)\n",
        "df2_filled_templates = inject_df2_vals_into_template(df2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 91,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 89
        },
        "id": "Kn1n4jpb40Vb",
        "outputId": "bbff9bb6-b52e-4582-b70f-d062ebb0b8db"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Against the TREC dataset, the \"Uncalibrated monoBERT\" method (from the \"A\" category) got a nDCG score of 0.799, a nDCG@10 score of 0.494, a CB-ECE score of 1.205, an ECE score of 0.320, and an MSE score of 0.773. Against the NTCIR dataset, the \"Uncalibrated monoBERT\" method (from the \"A\" category) got a nDCG score of | 0.735, a nDCG@10 score of 0.337, a CB-ECE score of 1.757, an ECE score of 0.799, and an MSE score of 1.824.'"
            ]
          },
          "execution_count": 91,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview the first filled-in template. Note that OCR artifacts from the\n",
        "# PDF table extraction (e.g. a stray '| ' before 0.735) carry over into the text.\n",
        "df2_filled_templates[0]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 92,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 89
        },
        "id": "_GhtZ9TFSdTb",
        "outputId": "9f56b758-ad04-43bb-f6bf-379423d4a22b"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            },
            "text/plain": [
              "'Against the TREC dataset, the \"Literal Explanation + BERT\" method (from the \"F\" category) got a nDCG score of 0.815*, a nDCG@10 score of 0.529%, a CB-ECE score of 0.996\u00b0, an ECE score of 0.067*, and an MSE score of 0.602\" |. Against the NTCIR dataset, the \"Literal Explanation + BERT\" method (from the \"F\" category) got a nDCG score of 0.742, a nDCG@10 score of 0.340, a CB-ECE score of 1.534\", an ECE score of 0.355, and an MSE score of 1.3307.'"
            ]
          },
          "execution_count": 92,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Preview the second-to-last template ('Literal Explanation + BERT' row)\n",
        "df2_filled_templates[-2]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 93,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "V_IkN4lwfw1Z",
        "outputId": "ba1cb86d-f6e0-4bcc-d5de-550bd1c7f690"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Now turn both lists of filled-in templates into LlamaIndex Document objects\n",
        "# (turn_data_into_documents is a helper defined earlier in this notebook):\n",
        "df1_filled_templates_docs = turn_data_into_documents(df1_filled_templates)\n",
        "df2_filled_templates_docs = turn_data_into_documents(df2_filled_templates)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Y5Ukh_ac47HL"
      },
      "source": [
        "### Run indexing pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 94,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 49,
          "referenced_widgets": [
            "b3e16e2ee79a41818437fd2e45a10ab6",
            "08bcbbb1aaf64e61bcf641162592245f",
            "838ef7beab4848beb9426d0dd127ba29",
            "8fd8f8c9813d4c2ab99a77c7862a97f2",
            "6804bc883e03469ebd2687fffe4d2d9e",
            "542f183046f14799825b8df25ec98ce9",
            "8b0a3c305285445d97f053ea9937074e",
            "2d0003b57e0d489f8f76cad0ebda4145",
            "c4649f99409c4bf4b4fa9184f3519aed",
            "0fe4606f93994f96bbccf0c176c45a29",
            "cf2566c4889a4fb78e5ae87e92c1ba10"
          ]
        },
        "id": "S8CE5_OnkDv-",
        "outputId": "4e64fda8-5588-4930-f7a1-308682ee2fd7"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "b3e16e2ee79a41818437fd2e45a10ab6",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Upserted vectors:   0%|          | 0/63 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Declare new namespace for the v4 variant (df1 + df2 templates + control docs)\n",
        "v4_namespace = 'v4'\n",
        "\n",
        "# Initialize vector store w/v4 namespace\n",
        "v4_vector_store = initialize_vector_store(pinecone_index, v4_namespace)\n",
        "\n",
        "# Join docs\n",
        "v4_docs = df1_filled_templates_docs + df2_filled_templates_docs + ctrl_docs\n",
        "\n",
        "# Run through embedding and indexing pipeline (upserts 63 vectors into 'v4')\n",
        "output = run_indexing_pipeline(v4_vector_store, v4_docs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 95,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 160
        },
        "id": "4s-gPgzll1_p",
        "outputId": "221ab7c8-3b8f-4eab-f077-99651171d5ae"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "text/plain": [
              "{'dimension': 1536,\n",
              " 'index_fullness': 0.0,\n",
              " 'namespaces': {'control': {'vector_count': 48},\n",
              "                'v1': {'vector_count': 63},\n",
              "                'v2': {'vector_count': 63},\n",
              "                'v3': {'vector_count': 76},\n",
              "                'v4': {'vector_count': 0}},\n",
              " 'total_vector_count': 250}"
            ]
          },
          "execution_count": 95,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Sanity-check the index contents per namespace.\n",
        "# NOTE(review): Pinecone index stats appear to be eventually consistent — the\n",
        "# freshly upserted 'v4' namespace reports a vector_count of 0 in the output\n",
        "# below even though the upsert of 63 vectors above completed; re-running this\n",
        "# cell after a short delay should show the updated count.\n",
        "pinecone_index.describe_index_stats()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TDu6-kes5VDC"
      },
      "source": [
        "### Run RAG pipeline"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 111,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "K4v4S_Pf5aBP",
        "outputId": "9e27cec9-f335-4217-bf83-08ffbe1bf0af"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# Run RAG pipeline for v4 use case: one response per query in QUERIES\n",
        "# (seven queries -> seven unpacked results)\n",
        "one_v4, two_v4, three_v4, four_v4, five_v4, six_v4, seven_v4 = run_rag_pipeline(v4_vector_store, QUERIES)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 113,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "8JTL4ztf5aBZ",
        "outputId": "06b7729a-4c87-45ea-b258-b907ab480624"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.google.colaboratory.intrinsic+json": {
              "summary": "{\n  \"name\": \"exp_results\",\n  \"rows\": 7,\n  \"fields\": [\n    {\n      \"column\": \"ANSWER\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)\",\n          \"TREC: 0.529; NTCIR: 0.340.\",\n          \"Category F: training nle-based neural rankers on calibration data.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"control\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in Table 1.\",\n          \"0.529 and 0.534\",\n          \"Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance and one for non-relevance), an additional processing step is involved. 
The hidden states obtained from encoding both NLEs are concatenated, and this concatenated representation is fed into an additional linear layer to transform these combined hidden states into a final ranking score.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v1\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter compared to the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to refine the ranking scores of the neural ranker.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v2\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is significantly lower than the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.\",\n          \"Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. 
For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs to enhance the ranking performance of neural rankers.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v3\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is shorter than the average document length in Table 1.\",\n          \"The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and the NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.\",\n          \"Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this context, it means that the model was fine-tuned to process meta natural language explanations (NLEs) for query-document pairs and generate scale-calibrated ranking scores.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"v4\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 7,\n        \"samples\": [\n          \"The average query length is significantly lower than the average document length in table 1.\",\n          \"The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.\",\n          \"Category F was used to build and train the \\\"Literal Explanation + BERT\\\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. 
The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performance of neural rankers.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
              "type": "dataframe",
              "variable_name": "exp_results"
            },
            "text/html": [
              "\n",
              "  <div id=\"df-7391e708-4337-435c-8749-64206a2f8d63\" class=\"colab-df-container\">\n",
              "    <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>ANSWER</th>\n",
              "      <th>control</th>\n",
              "      <th>v1</th>\n",
              "      <th>v2</th>\n",
              "      <th>v3</th>\n",
              "      <th>v4</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>How does the average query length compare to the average document length in table 1?</th>\n",
              "      <td>The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)</td>\n",
              "      <td>The average query length is shorter compared to the average document length in Table 1.</td>\n",
              "      <td>The average query length is shorter compared to the average document length in table 1.</td>\n",
              "      <td>The average query length is significantly lower than the average document length in table 1.</td>\n",
              "      <td>The average query length is shorter than the average document length in Table 1.</td>\n",
              "      <td>The average query length is significantly lower than the average document length in table 1.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?</th>\n",
              "      <td>TREC: 0.529; NTCIR: 0.340.</td>\n",
              "      <td>0.529 and 0.534</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.</td>\n",
              "      <td>The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.</td>\n",
              "      <td>The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and the NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.</td>\n",
              "      <td>The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?</th>\n",
              "      <td>NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...</td>\n",
              "      <td>Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs h...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>How do i interpret table 2's calibration and ranking scores?</th>\n",
              "      <td>Lower is better for calibration, higher is better for ranking</td>\n",
              "      <td>Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...</td>\n",
              "      <td>The calibration and ranking scores in Table 2 are evaluated based on different metrics such as nDCG, nDCG@10, CB-ECE, ECE, and MSE. Lower values are considered better for calibration metrics like CB-ECE, ECE, and MSE. The table compares the performance of baseline methods and new approaches on two scale calibration datasets, TREC and NTCIR. The results show how different methods perform in ter...</td>\n",
              "      <td>Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What are the weights of \"Uncalibrated monoBERT\" tuned on?</th>\n",
              "      <td>MSMarco</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "      <td>The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>What category was used to build and train literal explanation + BERT? what does this category mean?</th>\n",
              "      <td>Category F: training nle-based neural rankers on calibration data.</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...</td>\n",
              "      <td>Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...</td>\n",
              "      <td>Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this context, it means that the model was fine-tuned to process meta natural language explanations (NLEs) for query-document pairs and generate scale-calibrated ranking scores.</td>\n",
              "      <td>Category F was used to build and train the \"Literal Explanation + BERT\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performan...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?</th>\n",
              "      <td>Yes</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.</td>\n",
              "      <td>Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.</td>\n",
              "      <td>No, the 'trec-dl' in table 1 is not the same as the 'trec' in table 2.</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "    <div class=\"colab-df-buttons\">\n",
              "\n",
              "  <div class=\"colab-df-container\">\n",
              "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7391e708-4337-435c-8749-64206a2f8d63')\"\n",
              "            title=\"Convert this dataframe to an interactive table.\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
              "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
              "  </svg>\n",
              "    </button>\n",
              "\n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    .colab-df-buttons div {\n",
              "      margin-bottom: 4px;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "    <script>\n",
              "      const buttonEl =\n",
              "        document.querySelector('#df-7391e708-4337-435c-8749-64206a2f8d63 button.colab-df-convert');\n",
              "      buttonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "      async function convertToInteractive(key) {\n",
              "        const element = document.querySelector('#df-7391e708-4337-435c-8749-64206a2f8d63');\n",
              "        const dataTable =\n",
              "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                    [key], {});\n",
              "        if (!dataTable) return;\n",
              "\n",
              "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "          + ' to learn more about interactive tables.';\n",
              "        element.innerHTML = '';\n",
              "        dataTable['output_type'] = 'display_data';\n",
              "        await google.colab.output.renderOutput(dataTable, element);\n",
              "        const docLink = document.createElement('div');\n",
              "        docLink.innerHTML = docLinkHtml;\n",
              "        element.appendChild(docLink);\n",
              "      }\n",
              "    </script>\n",
              "  </div>\n",
              "\n",
              "\n",
              "<div id=\"df-67fb2798-662b-4b7a-82b4-636019f40033\">\n",
              "  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-67fb2798-662b-4b7a-82b4-636019f40033')\"\n",
              "            title=\"Suggest charts\"\n",
              "            style=\"display:none;\">\n",
              "\n",
              "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 0 24 24\"\n",
              "     width=\"24px\">\n",
              "    <g>\n",
              "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
              "    </g>\n",
              "</svg>\n",
              "  </button>\n",
              "\n",
              "<style>\n",
              "  .colab-df-quickchart {\n",
              "      --bg-color: #E8F0FE;\n",
              "      --fill-color: #1967D2;\n",
              "      --hover-bg-color: #E2EBFA;\n",
              "      --hover-fill-color: #174EA6;\n",
              "      --disabled-fill-color: #AAA;\n",
              "      --disabled-bg-color: #DDD;\n",
              "  }\n",
              "\n",
              "  [theme=dark] .colab-df-quickchart {\n",
              "      --bg-color: #3B4455;\n",
              "      --fill-color: #D2E3FC;\n",
              "      --hover-bg-color: #434B5C;\n",
              "      --hover-fill-color: #FFFFFF;\n",
              "      --disabled-bg-color: #3B4455;\n",
              "      --disabled-fill-color: #666;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart {\n",
              "    background-color: var(--bg-color);\n",
              "    border: none;\n",
              "    border-radius: 50%;\n",
              "    cursor: pointer;\n",
              "    display: none;\n",
              "    fill: var(--fill-color);\n",
              "    height: 32px;\n",
              "    padding: 0;\n",
              "    width: 32px;\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart:hover {\n",
              "    background-color: var(--hover-bg-color);\n",
              "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "    fill: var(--hover-fill-color);\n",
              "  }\n",
              "\n",
              "  .colab-df-quickchart-complete:disabled,\n",
              "  .colab-df-quickchart-complete:disabled:hover {\n",
              "    background-color: var(--disabled-bg-color);\n",
              "    fill: var(--disabled-fill-color);\n",
              "    box-shadow: none;\n",
              "  }\n",
              "\n",
              "  .colab-df-spinner {\n",
              "    border: 2px solid var(--fill-color);\n",
              "    border-color: transparent;\n",
              "    border-bottom-color: var(--fill-color);\n",
              "    animation:\n",
              "      spin 1s steps(1) infinite;\n",
              "  }\n",
              "\n",
              "  @keyframes spin {\n",
              "    0% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "      border-left-color: var(--fill-color);\n",
              "    }\n",
              "    20% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    30% {\n",
              "      border-color: transparent;\n",
              "      border-left-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    40% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-top-color: var(--fill-color);\n",
              "    }\n",
              "    60% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "    }\n",
              "    80% {\n",
              "      border-color: transparent;\n",
              "      border-right-color: var(--fill-color);\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "    90% {\n",
              "      border-color: transparent;\n",
              "      border-bottom-color: var(--fill-color);\n",
              "    }\n",
              "  }\n",
              "</style>\n",
              "\n",
              "  <script>\n",
              "    async function quickchart(key) {\n",
              "      const quickchartButtonEl =\n",
              "        document.querySelector('#' + key + ' button');\n",
              "      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
              "      quickchartButtonEl.classList.add('colab-df-spinner');\n",
              "      try {\n",
              "        const charts = await google.colab.kernel.invokeFunction(\n",
              "            'suggestCharts', [key], {});\n",
              "      } catch (error) {\n",
              "        console.error('Error during call to suggestCharts:', error);\n",
              "      }\n",
              "      quickchartButtonEl.classList.remove('colab-df-spinner');\n",
              "      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
              "    }\n",
              "    (() => {\n",
              "      let quickchartButtonEl =\n",
              "        document.querySelector('#df-67fb2798-662b-4b7a-82b4-636019f40033 button');\n",
              "      quickchartButtonEl.style.display =\n",
              "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "    })();\n",
              "  </script>\n",
              "</div>\n",
              "    </div>\n",
              "  </div>\n"
            ],
            "text/plain": [
              "                                                                                                                                                                                                                                                                                              ANSWER  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                        The average query length is shorter in TREC (8) than it is in NTCIR (22). The average doc length is also shorter in TREC (70.9 vs 493.2)   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                              TREC: 0.529; NTCIR: 0.340.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?                    NLEs lead to better calibrated neural rankers while maintaining or even boosting ranking performance in most scenarios   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                                                                                           Lower is better for calibration, higher is better for ranking   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                    MSMarco   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                               Category F: training nle-based neural rankers on calibration data.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                   Yes   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    control  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                                                0.529 and 0.534   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpreting Table 2's calibration and ranking scores involves understanding that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. The table compares the performance of different methods on two scale calibration datasets, TREC and NTCIR, using metrics like nDCG, nDCG@10, CB-ECE, ECE, and MSE. Significant improvements over a baseline method are marked with a dagger ...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. Specifically, in the scenario of the literal explanation approach, where each input is represented with two meta NLEs (one for relevance an...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v1  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                The average query length is shorter compared to the average document length in table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                             The Literal Explanation + BERT method's nDCG@10 scores on both datasets in Table 2 are 0.529 for the TREC dataset and 0.602 for the NTCIR dataset.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Statistically significant improvements over the \"Platt Scaling monoBERT\" baseline are marked with a dagger symbol. The table presents the performance of different methods on two scale calibration datasets, TREC and NTCIR, with metrics like nDCG, n...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. The approach includes generating and aggregating natural language explanations for query-document pairs, and then using these explanations to r...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                Yes, the 'trec-dl' in Table 1 is the same dataset as the 'TREC' dataset mentioned in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v2  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                           The average query length is significantly lower than the average document length in table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                            The Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2 are 0.529% for the TREC dataset and 0.340 for the NTCIR dataset.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train literal explanation + BERT. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and produce scale-calibrated ranking scores. For the literal explanation approach, this category involves generating and aggregating natural language explanations for query-document pairs ...   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in table 1 is the same as the 'TREC' in table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v3  \\\n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                                       The average query length is shorter than the average document length in Table 1.   \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                               The Literal Explanation + BERT method's ndcg@10 scores on the TREC dataset and the NTCIR dataset in table 2 are 0.529% and 0.340%, respectively.   \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, resulting in lower calibration error values compared to other calibration methods. Additi...   \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                The calibration and ranking scores in Table 2 are evaluated based on different metrics such as nDCG, nDCG@10, CB-ECE, ECE, and MSE. Lower values are considered better for calibration metrics like CB-ECE, ECE, and MSE. The table compares the performance of baseline methods and new approaches on two scale calibration datasets, TREC and NTCIR. The results show how different methods perform in ter...   \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.   \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                                                                                          Category F was used to build and train the literal explanation + BERT model. This category involves training NLE-based neural rankers on calibration data. In this context, it means that the model was fine-tuned to process meta natural language explanations (NLEs) for query-document pairs and generate scale-calibrated ranking scores.   \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                          Yes, the 'trec-dl' in Table 1 is the same as the 'TREC' in Table 2.   \n",
              "\n",
              "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         v4  \n",
              "How does the average query length compare to the average document length in table 1?                                                                                                                                                                                                                                                                                                                                                                                           The average query length is significantly lower than the average document length in table 1.  \n",
              "What are the Literal Explanation + BERT method's ndcg@10 scores on both datasets in table 2?                                                                                                                                                                                                                                                                                                                                                         The Literal Explanation + BERT method's nDCG@10 score on the TREC dataset is 0.529% and on the NTCIR dataset is 0.340.  \n",
              "What impact do natural language explanations (NLEs) have on improving the calibration and overall effectiveness of these models in document ranking tasks?  Natural language explanations (NLEs) significantly enhance the scale calibration of neural rankers, often maintaining or even boosting ranking performance in most scenarios. They provide valuable insights for document differentiation and lead to statistically significant improvements in scale calibration, exhibiting lower error values compared to other calibration methods. Additionally, NLEs h...  \n",
              "How do i interpret table 2's calibration and ranking scores?                                                                                                Interpret the calibration and ranking scores in Table 2 by noting that lower values are better for calibration metrics such as CB-ECE, ECE, and MSE. Look for statistically significant improvements over the \"Platt Scaling monoBERT\" baseline, which are marked with a dagger symbol (\u2020). Pay attention to the values in the columns for Ranking and Calibration metrics for different methods and dataset...  \n",
              "What are the weights of \"Uncalibrated monoBERT\" tuned on?                                                                                                                                                                                                                                                                                                                                                                                                                                                The weights of \"Uncalibrated monoBERT\" are fine-tuned on MS MARCO.  \n",
              "What category was used to build and train literal explanation + BERT? what does this category mean?                                                         Category F was used to build and train the \"Literal Explanation + BERT\" method. This category involves training NLE-based neural rankers on calibration data. In this method, a BERT model is finetuned to process meta NLEs and generate scale-calibrated ranking scores. The literal explanation approach utilizes natural language explanations for query-document pairs to enhance the ranking performan...  \n",
              "Is the 'trec-dl' in table 1 the same as the 'trec' in table 2?                                                                                                                                                                                                                                                                                                                                                                                                                                       No, the 'trec-dl' in table 1 is not the same as the 'trec' in table 2.  "
            ]
          },
          "execution_count": 113,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "# Add variant 4's responses to `exp_results` dataframe:\n",
        "# NOTE(review): order matters — each element aligns positionally with the\n",
        "# existing rows of `exp_results` (questions 1-7); the list length must equal\n",
        "# the number of rows or pandas raises a ValueError on assignment.\n",
        "v4_responses = [one_v4, two_v4, three_v4, four_v4, five_v4, six_v4, seven_v4]\n",
        "\n",
        "exp_results['v4'] = v4_responses\n",
        "\n",
        "# Bare last expression -> rich DataFrame display (preferred over print()).\n",
        "exp_results"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 98,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "fcR3rYODtiJ-",
        "outputId": "ef02992a-e6ab-4b14-eae0-7cb7e9646947"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <style>\n",
              "    pre {\n",
              "        white-space: pre-wrap;\n",
              "    }\n",
              "  </style>\n",
              "  "
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "# The end!"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.5"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}